From 89029cb02d3f0371d5dbc8939e43450e2183c314 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?=
Date: Tue, 20 Jan 2026 14:16:54 +0100
Subject: [PATCH 01/49] Draft of changes introducing timestamping

---
 .../host_objects/JsiConversions.h             |  14 ++
 .../models/speech_to_text/SpeechToText.cpp    | 127 +++++++++++--
 .../stream/OnlineASRProcessor.cpp             |  32 ++--
 .../stream/OnlineASRProcessor.h               |   5 +-
 .../useSpeechToText.ts                        | 171 +++++++++++++++++-
 .../SpeechToTextModule.ts                     |  24 +--
 6 files changed, 319 insertions(+), 54 deletions(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
index 2baf922db..570ba0939 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
@@ -62,6 +62,20 @@ getValue<std::shared_ptr<jsi::Function>>(const jsi::Value &val,
       val.asObject(runtime).asFunction(runtime));
 }
 
+// Serializes a vector of timestamped words into a JSI array of objects.
+inline jsi::Value getJsiValue(const std::vector<Word> &words,
+                              jsi::Runtime &rt) {
+  jsi::Array jsiArr(rt, words.size());
+  for (size_t i = 0; i < words.size(); ++i) {
+    jsi::Object obj(rt);
+    obj.setProperty(rt, "word",
+                    jsi::String::createFromUtf8(rt, words[i].content));
+    obj.setProperty(rt, "start", static_cast<double>(words[i].start));
+    obj.setProperty(rt, "end", static_cast<double>(words[i].end));
+    jsiArr.setValueAtIndex(rt, i, obj);
+  }
+  return jsiArr;
+}
+
 template <>
 inline JSTensorViewIn getValue(const jsi::Value &val,
                                jsi::Runtime &runtime) {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
index 6299c9c40..f026f30a9 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
@@ -66,11 +66,92 @@ std::vector<char> SpeechToText::transcribe(std::span<float> waveform,
   return {transcription.begin(), transcription.end()};
 }
 
+std::vector<Word> SpeechToText::transcribe(std::span<float> waveform,
+                                           std::string languageOption) const {
+  std::vector<Segment> segments =
+      this->asr->transcribe(waveform, DecodingOptions(languageOption));
+  std::vector<Word> transcription;
+
+  size_t transcriptionLength = 0;
+  for (auto &segment : segments) {
+    transcriptionLength += segment.words.size();
+  }
+
+  transcription.reserve(transcriptionLength);
+
+  for (auto &segment : segments) {
+    for (auto &word : segment.words) {
+      transcription.push_back(word);
+    }
+  }
+
+  auto wordsToJsi = [](jsi::Runtime &rt,
+                       const std::vector<Word> &words) -> jsi::Value {
+    jsi::Array jsiArr(rt, words.size());
+    for (size_t i = 0; i < words.size(); ++i) {
+      jsi::Object obj(rt);
+      obj.setProperty(rt, "word",
+                      jsi::String::createFromUtf8(rt, words[i].content));
+      obj.setProperty(rt, "start", static_cast<double>(words[i].start));
+      obj.setProperty(rt, "end", static_cast<double>(words[i].end));
+      jsiArr.setValueAtIndex(rt, i, obj);
+    }
+    return jsiArr;
+  };
+
+  return transcription;
+}
+
 size_t SpeechToText::getMemoryLowerBound() const noexcept {
   return this->encoder->getMemoryLowerBound() +
          this->decoder->getMemoryLowerBound();
 }
 
+// void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
+//                           std::string languageOption) {
+//   if (this->isStreaming) {
+//     throw RnExecutorchError(RnExecutorchErrorCode::StreamingInProgress,
+//                             "Streaming is already in progress!");
+//   }
+
+//   auto nativeCallback =
+//       [this, callback](const std::vector<char> &committedVec,
+//                        const std::vector<char> &nonCommittedVec,
+//                        bool isDone) {
+//         this->callInvoker->invokeAsync([callback, committedVec,
+//                                         nonCommittedVec,
+//                                         isDone](jsi::Runtime &rt) {
+//           callback->call(
+//               rt,
+//               rnexecutorch::jsi_conversion::getJsiValue(committedVec, rt),
+//               rnexecutorch::jsi_conversion::getJsiValue(nonCommittedVec, rt),
+//               jsi::Value(isDone));
+//         });
+//       };
+
+//   this->isStreaming = true;
+//   while (this->isStreaming) {
+//     if (!this->readyToProcess ||
+//         this->processor->audioBuffer.size() < SpeechToText::kMinAudioSamples) {
+//       std::this_thread::sleep_for(std::chrono::milliseconds(100));
+//       continue;
+//     }
+//     ProcessResult res =
+//         this->processor->processIter(DecodingOptions(languageOption));
+
+//     nativeCallback({res.committed.begin(), res.committed.end()},
+//                    {res.nonCommitted.begin(), res.nonCommitted.end()},
+//                    false);
+//     this->readyToProcess = false;
+//   }
+
+//   std::string committed = this->processor->finish();
+
+//   nativeCallback({committed.begin(), committed.end()}, {}, true);
+
+//   this->resetStreamState();
+// }
+
 void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
                           std::string languageOption) {
   if (this->isStreaming) {
@@ -78,17 +159,33 @@ void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
                            "Streaming is already in progress!");
   }
 
-  auto nativeCallback =
-      [this, callback](const std::vector<char> &committedVec,
-                       const std::vector<char> &nonCommittedVec, bool isDone) {
-        this->callInvoker->invokeAsync([callback, committedVec, nonCommittedVec,
-                                        isDone](jsi::Runtime &rt) {
-          callback->call(
-              rt, rnexecutorch::jsi_conversion::getJsiValue(committedVec, rt),
-              rnexecutorch::jsi_conversion::getJsiValue(nonCommittedVec, rt),
-              jsi::Value(isDone));
-        });
-      };
+  auto wordsToJsi = [](jsi::Runtime &rt,
+                       const std::vector<Word> &words) -> jsi::Value {
+    jsi::Array jsiArr(rt, words.size());
+    for (size_t i = 0; i < words.size(); ++i) {
+      jsi::Object obj(rt);
+      obj.setProperty(rt, "word",
+                      jsi::String::createFromUtf8(rt, words[i].content));
+      obj.setProperty(rt, "start", static_cast<double>(words[i].start));
+      obj.setProperty(rt, "end", static_cast<double>(words[i].end));
+      jsiArr.setValueAtIndex(rt, i, obj);
+    }
+    return jsiArr;
+  };
+
+  auto nativeCallback = [this, callback,
+                         wordsToJsi](const std::vector<Word> &committedVec,
+                                     const std::vector<Word> &nonCommittedVec,
+                                     bool isDone) {
+    this->callInvoker->invokeAsync([callback, committedVec, nonCommittedVec,
+                                    isDone, wordsToJsi](jsi::Runtime &rt) {
+      jsi::Value committedJsi = wordsToJsi(rt, committedVec);
+      jsi::Value nonCommittedJsi = wordsToJsi(rt, nonCommittedVec);
+
+      callback->call(rt, std::move(committedJsi), std::move(nonCommittedJsi),
+                     jsi::Value(isDone));
+    });
+  };
 
   this->isStreaming = true;
   while (this->isStreaming) {
@@ -100,14 +197,14 @@ void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
     ProcessResult res =
         this->processor->processIter(DecodingOptions(languageOption));
 
-    nativeCallback({res.committed.begin(), res.committed.end()},
-                   {res.nonCommitted.begin(), res.nonCommitted.end()}, false);
+    nativeCallback(res.committed, res.nonCommitted, false);
     this->readyToProcess = false;
   }
 
-  std::string committed = this->processor->finish();
+  // finish() now returns std::vector<Word>
+  std::vector<Word> committed = this->processor->finish();
 
-  nativeCallback({committed.begin(), committed.end()}, {}, true);
+  nativeCallback(committed, {}, true);
 
   this->resetStreamState();
 }
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp
index c6a99e9a2..b8a7aced4 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp
@@ -77,23 +77,27 @@ void OnlineASRProcessor::chunkAt(float time) {
   this->bufferTimeOffset = time;
 }
 
-std::string OnlineASRProcessor::finish() {
-  const std::deque<Word> buffer = this->hypothesisBuffer.complete();
-  std::string committedText = this->toFlush(buffer);
+std::vector<Word> OnlineASRProcessor::finish() {
+  std::deque<Word> bufferDeq = this->hypothesisBuffer.complete();
+  std::vector<Word> buffer(std::make_move_iterator(bufferDeq.begin()),
+                           std::make_move_iterator(bufferDeq.end()));
+
+  // std::string committedText = this->toFlush(buffer);
 
   this->bufferTimeOffset += static_cast<float>(audioBuffer.size()) /
                             OnlineASRProcessor::kSamplingRate;
 
-  return committedText;
+  return buffer;
 }
 
-std::string OnlineASRProcessor::toFlush(const std::deque<Word> &words) const {
-  std::string text;
-  text.reserve(std::accumulate(
-      words.cbegin(), words.cend(), 0,
-      [](size_t sum, const Word &w) { return sum + w.content.size(); }));
-  for (const auto &word : words) {
-    text.append(word.content);
-  }
-  return text;
-}
+// std::string OnlineASRProcessor::toFlush(const std::deque<Word> &words) const
+// {
+//   std::string text;
+//   text.reserve(std::accumulate(
+//       words.cbegin(), words.cend(), 0,
+//       [](size_t sum, const Word &w) { return sum + w.content.size(); }));
+//   for (const auto &word : words) {
+//     text.append(word.content);
+//   }
+//   return text;
+// }
 
 } // namespace rnexecutorch::models::speech_to_text::stream
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h
index c50b56271..720e6bf76 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h
@@ -12,7 +12,8 @@ class OnlineASRProcessor {
 
   void insertAudioChunk(std::span<float> audio);
   types::ProcessResult processIter(const types::DecodingOptions &options);
-  std::string finish();
+  // std::string finish();
+  std::vector<types::Word> finish();
 
   std::vector<float> audioBuffer;
 
@@ -27,7 +28,7 @@ class OnlineASRProcessor {
   void chunkCompletedSegment(std::span<types::Segment> res);
   void chunkAt(float time);
 
-  std::string toFlush(const std::deque<types::Word> &words) const;
+  // std::string toFlush(const std::deque<types::Word> &words) const;
 };
 
 } // namespace rnexecutorch::models::speech_to_text::stream
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
index 74734c35e..8285c918d 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
@@ -1,3 +1,141 @@
+// import { useEffect, useCallback, useState } from 'react';
+// import { SpeechToTextModule, Word } from '../../modules/natural_language_processing/SpeechToTextModule';
+// import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt';
+// import { RnExecutorchErrorCode } from '../../errors/ErrorCodes';
+// import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils';
+
+// export const useSpeechToText = ({
+//   model,
+//   preventLoad = false,
+// }: {
+//   model: SpeechToTextModelConfig;
+//   preventLoad?: boolean;
+// }) => {
+//   const [error, setError] = useState<RnExecutorchError | null>(null);
+//   const [isReady, setIsReady] = useState(false);
+//   const [isGenerating, setIsGenerating] = useState(false);
+//   const [downloadProgress, setDownloadProgress] = useState(0);
+
+//   const [modelInstance] = useState(() => new SpeechToTextModule());
+//   const [committedTranscription, setCommittedTranscription] = useState(Word);
+//   const [nonCommittedTranscription, setNonCommittedTranscription] =
+//     useState(Word);
+
+//   useEffect(() => {
+//     if (preventLoad) return;
+//     (async () => {
+//       setDownloadProgress(0);
+//       setError(null);
+//       try {
+//         setIsReady(false);
+//         await modelInstance.load(
+//           {
+//             isMultilingual: model.isMultilingual,
+//             encoderSource: model.encoderSource,
+//             decoderSource: model.decoderSource,
+//             tokenizerSource: model.tokenizerSource,
+//           },
+//           setDownloadProgress
+//         );
+//         setIsReady(true);
+//       } catch (err) {
+//         setError(parseUnknownError(err));
+//       }
+//     })();
+//   }, [
+//     modelInstance,
+//     model.isMultilingual,
+//     model.encoderSource,
+//     model.decoderSource,
+//     model.tokenizerSource,
+//     preventLoad,
+//   ]);
+
+//   const stateWrapper = useCallback(
+//     <T extends (...args: any) => Promise<any>>(fn: T) =>
+//       async (...args: Parameters<T>): Promise<Awaited<ReturnType<T>>> => {
+//         if (!isReady)
+//           throw new RnExecutorchError(
+//             RnExecutorchErrorCode.ModuleNotLoaded,
+//             'The model is currently not loaded. Please load the model before calling this function.'
+//           );
+//         if (isGenerating)
+//           throw new RnExecutorchError(
+//             RnExecutorchErrorCode.ModelGenerating,
+//             'The model is currently generating. Please wait until previous model run is complete.'
+//           );
+//         setIsGenerating(true);
+//         try {
+//           return await fn.apply(modelInstance, args);
+//         } finally {
+//           setIsGenerating(false);
+//         }
+//       },
+//     [isReady, isGenerating, modelInstance]
+//   );
+
+//   const stream = useCallback(
+//     async (options?: DecodingOptions) => {
+//       if (!isReady)
+//         throw new RnExecutorchError(
+//           RnExecutorchErrorCode.ModuleNotLoaded,
+//           'The model is currently not loaded. Please load the model before calling this function.'
+//         );
+//       if (isGenerating)
+//         throw new RnExecutorchError(
+//           RnExecutorchErrorCode.ModelGenerating,
+//           'The model is currently generating. Please wait until previous model run is complete.'
+//         );
+//       setIsGenerating(true);
+//       setCommittedTranscription('');
+//       setNonCommittedTranscription('');
+//       let transcription = '';
+//       try {
+//         for await (const { committed, nonCommitted } of modelInstance.stream(
+//           options
+//         )) {
+//           setCommittedTranscription((prev) => prev + committed);
+//           setNonCommittedTranscription(nonCommitted);
+//           transcription += committed;
+//         }
+//       } finally {
+//         setIsGenerating(false);
+//       }
+//       return transcription;
+//     },
+//     [isReady, isGenerating, modelInstance]
+//   );
+
+//   const wrapper = useCallback(
+//     <T extends (...args: any) => any>(fn: T) => {
+//       return (...args: Parameters<T>): ReturnType<T> => {
+//         if (!isReady)
+//           throw new RnExecutorchError(
+//             RnExecutorchErrorCode.ModuleNotLoaded,
+//             'The model is currently not loaded. Please load the model before calling this function.'
+//           );
+//         return fn.apply(modelInstance, args);
+//       };
+//     },
+//     [isReady, modelInstance]
+//   );
+
+//   return {
+//     error,
+//     isReady,
+//     isGenerating,
+//     downloadProgress,
+//     committedTranscription,
+//     nonCommittedTranscription,
+//     encode: stateWrapper(SpeechToTextModule.prototype.encode),
+//     decode: stateWrapper(SpeechToTextModule.prototype.decode),
+//     transcribe: stateWrapper(SpeechToTextModule.prototype.transcribe),
+//     stream,
+//     streamStop: wrapper(SpeechToTextModule.prototype.streamStop),
+//     streamInsert: wrapper(SpeechToTextModule.prototype.streamInsert),
+//   };
+// };
+
 import { useEffect, useCallback, useState } from 'react';
 import { SpeechToTextModule } from '../../modules/natural_language_processing/SpeechToTextModule';
 import {
@@ -25,9 +163,14 @@ export const useSpeechToText = ({
   const [downloadProgress, setDownloadProgress] = useState(0);
 
   const [modelInstance] = useState(() => new SpeechToTextModule());
-  const [committedTranscription, setCommittedTranscription] = useState('');
-  const [nonCommittedTranscription, setNonCommittedTranscription] =
-    useState('');
+
+  // FIX 1: Initialize with empty array [], generic type Word[]
+  const [committedTranscription, setCommittedTranscription] = useState<Word[]>(
+    []
+  );
+  const [nonCommittedTranscription, setNonCommittedTranscription] = useState<
+    Word[]
+  >([]);
 
   useEffect(() => {
     if (preventLoad) return;
@@ -95,21 +238,31 @@ export const useSpeechToText = ({
           'The model is currently generating. Please wait until previous model run is complete.'
         );
       setIsGenerating(true);
-      setCommittedTranscription('');
-      setNonCommittedTranscription('');
-      let transcription = '';
+
+      // FIX 2: Reset to empty arrays
+      setCommittedTranscription([]);
+      setNonCommittedTranscription([]);
+
+      // Accumulator is now an array of Words, not a string
+      const fullResult: Word[] = [];
+
       try {
         for await (const { committed, nonCommitted } of modelInstance.stream(
           options
         )) {
-          setCommittedTranscription((prev) => prev + committed);
+          // FIX 3: Update state by appending arrays
+          if (committed.length > 0) {
+            setCommittedTranscription((prev) => [...prev, ...committed]);
+            fullResult.push(...committed);
+          }
+
+          // nonCommitted is always a fresh partial chunk
           setNonCommittedTranscription(nonCommitted);
-          transcription += committed;
         }
       } finally {
         setIsGenerating(false);
       }
-      return transcription;
+      return fullResult;
     },
     [isReady, isGenerating, modelInstance]
   );
diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts
index 4b4f196df..803e4146a 100644
--- a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts
+++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts
@@ -10,13 +10,9 @@ import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils';
  */
 export class SpeechToTextModule {
   private nativeModule: any;
-
   private modelConfig!: SpeechToTextModelConfig;
 
-  private textDecoder = new TextDecoder('utf-8', {
-    fatal: false,
-    ignoreBOM: true,
-  });
+  // 2. TextDecoder is removed as C++ now returns JS objects directly
 
   /**
    * Loads the model specified by the config object.
@@ -105,13 +101,14 @@ export class SpeechToTextModule {
   public async transcribe(
     waveform: Float32Array,
     options: DecodingOptions = {}
-  ): Promise<string> {
+  ): Promise<Word[]> {
     this.validateOptions(options);
     const transcriptionBytes = await this.nativeModule.transcribe(
       waveform,
       options.language || ''
     );
-    return this.textDecoder.decode(new Uint8Array(transcriptionBytes));
+
+    return transcriptionBytes;
   }
 
   /**
@@ -128,10 +125,10 @@ export class SpeechToTextModule {
    */
   public async *stream(
     options: DecodingOptions = {}
-  ): AsyncGenerator<{ committed: string; nonCommitted: string }> {
+  ): AsyncGenerator<{ committed: Word[]; nonCommitted: Word[] }> {
     this.validateOptions(options);
 
-    const queue: { committed: string; nonCommitted: string }[] = [];
+    const queue: { committed: Word[]; nonCommitted: Word[] }[] = [];
     let waiter: (() => void) | null = null;
     let finished = false;
     let error: unknown;
@@ -144,12 +141,11 @@ export class SpeechToTextModule {
     (async () => {
       try {
         await this.nativeModule.stream(
-          (committed: number[], nonCommitted: number[], isDone: boolean) => {
+          // Callback now receives arrays of objects directly
+          (committed: Word[], nonCommitted: Word[], isDone: boolean) => {
             queue.push({
-              committed: this.textDecoder.decode(new Uint8Array(committed)),
-              nonCommitted: this.textDecoder.decode(
-                new Uint8Array(nonCommitted)
-              ),
+              committed,
+              nonCommitted,
             });
             if (isDone) {
               finished = true;

From 995d81f43c17fbe92877ad24f7b6e3e05259157a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?=
Date: Tue, 20 Jan 2026 14:44:24 +0100
Subject: [PATCH 02/49] Add missing headers

---
 .../common/rnexecutorch/host_objects/JsiConversions.h  | 1 +
 .../common/rnexecutorch/host_objects/ModelHostObject.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
index 570ba0939..bf1147162 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include <rnexecutorch/models/speech_to_text/types/Word.h>
 #include

diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
index a1ce8e8e8..815964aed 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include <rnexecutorch/models/speech_to_text/types/Word.h>
 #include
 #include
 #include

From 27910a4fc66762a1c074c9bcbc356cb73377a96b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?=
Date: Tue, 20 Jan 2026 21:59:21 +0100
Subject: [PATCH 03/49] Add draft of working version for timestamps only

---
 apps/speech/screens/SpeechToTextScreen.tsx    | 325 +++++++++++++++++-
 .../host_objects/JsiConversions.h             |  49 ++-
 .../host_objects/ModelHostObject.h            |   2 +
 .../models/speech_to_text/SpeechToText.cpp    |  40 +--
 .../stream/OnlineASRProcessor.cpp             |   6 +-
 .../speech_to_text/types/ProcessResult.h      |   9 +-
 6 files changed, 387 insertions(+), 44 deletions(-)

diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx
index da7ed0f7e..542d5dd01 100644
--- a/apps/speech/screens/SpeechToTextScreen.tsx
+++ b/apps/speech/screens/SpeechToTextScreen.tsx
@@
-1,3 +1,300 @@ +// import React, { useEffect, useRef, useState } from 'react'; +// import { +// Text, +// View, +// StyleSheet, +// TouchableOpacity, +// ScrollView, +// TextInput, +// KeyboardAvoidingView, +// Platform, +// } from 'react-native'; +// import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; +// import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch'; +// import FontAwesome from '@expo/vector-icons/FontAwesome'; +// import { +// AudioManager, +// AudioRecorder, +// AudioContext, +// } from 'react-native-audio-api'; +// import * as FileSystem from 'expo-file-system/legacy'; +// import SWMIcon from '../assets/swm_icon.svg'; +// import DeviceInfo from 'react-native-device-info'; + +// const isSimulator = DeviceInfo.isEmulatorSync(); + +// export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { +// const model = useSpeechToText({ +// model: WHISPER_TINY_EN, +// }); + +// const [transcription, setTranscription] = useState(''); +// const [audioURL, setAudioURL] = useState(''); +// const [liveTranscribing, setLiveTranscribing] = useState(false); +// const scrollViewRef = useRef(null); + +// const [recorder] = useState( +// () => +// new AudioRecorder({ +// sampleRate: 16000, +// bufferLengthInSamples: 1600, +// }) +// ); + +// useEffect(() => { +// AudioManager.setAudioSessionOptions({ +// iosCategory: 'playAndRecord', +// iosMode: 'spokenAudio', +// iosOptions: ['allowBluetooth', 'defaultToSpeaker'], +// }); +// AudioManager.requestRecordingPermissions(); +// }, []); + +// const handleTranscribeFromURL = async () => { +// if (!audioURL.trim()) { +// console.warn('Please provide a valid audio file URL'); +// return; +// } + +// const { uri } = await FileSystem.downloadAsync( +// audioURL, +// FileSystem.cacheDirectory + 'audio_file' +// ); + +// const audioContext = new AudioContext({ sampleRate: 16000 }); + +// try { +// const decodedAudioData = await audioContext.decodeAudioDataSource(uri); +// const audioBuffer = decodedAudioData.getChannelData(0); +// setTranscription(await model.transcribe(audioBuffer)); +// } catch (error) { +// console.error('Error decoding audio data', error); +// console.warn('Note: Supported file formats: mp3, wav, flac'); +// return; +// } +// }; + +// const handleStartTranscribeFromMicrophone = async () => { +// setLiveTranscribing(true); +// setTranscription(''); +// recorder.onAudioReady(({ buffer }) => { +// model.streamInsert(buffer.getChannelData(0)); +// }); +// recorder.start(); + +// try { +// await model.stream(); +// } catch (error) { +// console.error('Error during live transcription:', error); +// } +// }; + +// const handleStopTranscribeFromMicrophone = () => { +// recorder.stop(); +// model.streamStop(); +// console.log('Live transcription stopped'); +// setLiveTranscribing(false); +// }; + +// const getModelStatus = () => { +// if (model.error) return `${model.error}`; +// if (model.isGenerating) return 'Transcribing...'; +// if (model.isReady) return 'Ready to transcribe'; +// return `Loading model: ${(100 * model.downloadProgress).toFixed(2)}%`; +// }; + +// const readyToTranscribe = !model.isGenerating && model.isReady; +// const recordingButtonDisabled = isSimulator || !readyToTranscribe; + +// return ( +// +// +// +// +// +// +// +// +// React Native ExecuTorch +// Speech to Text +// + +// +// Status: {getModelStatus()} +// + +// +// Transcription +// +// scrollViewRef.current?.scrollToEnd({ animated: true }) +// } +// > +// +// {transcription !== '' +// ? 
transcription +// : model.committedTranscription + +// model.nonCommittedTranscription} +// +// +// + +// +// +// +// +// Start +// +// + +// {liveTranscribing ? ( +// +// +// Stop Live Transcription +// +// ) : ( +// +// +// +// {isSimulator +// ? 'Recording is not available on Simulator' +// : 'Start Live Transcription'} +// +// +// )} +// +// +// +// +// ); +// }; + +// const styles = StyleSheet.create({ +// container: { +// flex: 1, +// alignItems: 'center', +// backgroundColor: 'white', +// paddingHorizontal: 16, +// }, +// keyboardAvoidingView: { +// flex: 1, +// width: '100%', +// }, +// header: { +// alignItems: 'center', +// position: 'relative', +// width: '100%', +// }, +// backButton: { +// position: 'absolute', +// left: 0, +// top: 10, +// padding: 10, +// zIndex: 1, +// }, +// headerText: { +// fontSize: 22, +// fontWeight: 'bold', +// color: '#0f186e', +// }, +// statusContainer: { +// marginTop: 12, +// alignItems: 'center', +// }, +// transcriptionContainer: { +// flex: 1, +// width: '100%', +// marginVertical: 12, +// }, +// transcriptionLabel: { +// marginLeft: 12, +// marginBottom: 4, +// color: '#0f186e', +// }, +// transcriptionScrollContainer: { +// borderRadius: 12, +// borderWidth: 1, +// borderColor: '#0f186e', +// padding: 12, +// }, +// inputContainer: { +// marginBottom: 12, +// }, +// urlTranscriptionContainer: { +// width: '100%', +// flexDirection: 'row', +// }, +// urlTranscriptionInput: { +// flex: 1, +// padding: 12, +// borderTopLeftRadius: 12, +// borderBottomLeftRadius: 12, +// borderWidth: 1, +// borderColor: '#0f186e', +// borderRightWidth: 0, +// }, +// urlTranscriptionButton: { +// backgroundColor: '#0f186e', +// justifyContent: 'center', +// alignItems: 'center', +// padding: 12, +// borderTopRightRadius: 12, +// borderBottomRightRadius: 12, +// }, +// buttonText: { +// color: 'white', +// fontWeight: '600', +// letterSpacing: -0.5, +// fontSize: 16, +// }, +// liveTranscriptionButton: { +// flexDirection: 'row', +// justifyContent: 'center', +// alignItems: 'center', +// padding: 12, +// borderRadius: 12, +// marginTop: 12, +// gap: 8, +// }, +// backgroundRed: { +// backgroundColor: 'red', +// }, +// backgroundBlue: { +// backgroundColor: '#0f186e', +// }, +// disabled: { +// opacity: 0.5, +// }, +// }); + import React, { useEffect, useRef, useState } from 'react'; import { Text, @@ -10,7 +307,12 @@ import { Platform, } from 'react-native'; import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; -import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch'; +import { + useSpeechToText, + WHISPER_TINY_EN, + // Make sure Word is exported from your module + Word, +} from 'react-native-executorch'; import FontAwesome from '@expo/vector-icons/FontAwesome'; import { AudioManager, @@ -28,7 +330,9 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { model: WHISPER_TINY_EN, }); - const [transcription, setTranscription] = useState(''); + // CHANGE 1: Update state to hold Word[] instead of string + const [transcription, setTranscription] = useState([]); + const [audioURL, setAudioURL] = useState(''); const [liveTranscribing, setLiveTranscribing] = useState(false); const scrollViewRef = useRef(null); @@ -78,6 +382,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { try { const decodedAudioData = await audioContext.decodeAudioDataSource(uri); const audioBuffer = decodedAudioData.getChannelData(0); + // model.transcribe now returns Word[], which matches our state 
type
       setTranscription(await model.transcribe(audioBuffer));
     } catch (error) {
       console.error('Error decoding audio data', error);
@@ -88,7 +393,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
 
   const handleStartTranscribeFromMicrophone = async () => {
     setLiveTranscribing(true);
-    setTranscription('');
+    setTranscription([]); // Reset to empty array
     recorder.onAudioReady(({ buffer }) => {
       model.streamInsert(buffer.getChannelData(0));
     });
@@ -118,6 +423,13 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
   const readyToTranscribe = !model.isGenerating && model.isReady;
   const recordingButtonDisabled = isSimulator || !readyToTranscribe;
 
+  // CHANGE 3: Prepare the text for rendering
+  const displayedText =
+    transcription.length > 0
+      ? getText(transcription)
+      : getText(model.committedTranscription) +
+        getText(model.nonCommittedTranscription);
+
   return (
@@ -147,12 +459,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
             scrollViewRef.current?.scrollToEnd({ animated: true })
           }
         >
-          <Text>
-            {transcription !== ''
-              ? transcription
-              : model.committedTranscription +
-                model.nonCommittedTranscription}
-          </Text>
+          <Text>{displayedText}</Text>
 
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
index bf1147162..184b66a4f 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
@@ -21,6 +21,8 @@
 #include
 #include
 
+using rnexecutorch::models::speech_to_text::types::Word;
+
 namespace rnexecutorch::jsi_conversion {
 
 using namespace facebook;
@@ -64,18 +66,24 @@ getValue<std::shared_ptr<jsi::Function>>(const jsi::Value &val,
 }
 
-// Serializes a vector of timestamped words into a JSI array of objects.
-inline jsi::Value getJsiValue(const std::vector<Word> &words,
-                              jsi::Runtime &rt) {
-  jsi::Array jsiArr(rt, words.size());
-  for (size_t i = 0; i < words.size(); ++i) {
-    jsi::Object obj(rt);
-    obj.setProperty(rt, "word",
-                    jsi::String::createFromUtf8(rt, words[i].content));
-    obj.setProperty(rt, "start", static_cast<double>(words[i].start));
-    obj.setProperty(rt, "end", static_cast<double>(words[i].end));
-    jsiArr.setValueAtIndex(rt, i, obj);
-  }
-  return jsiArr;
-}
+template <>
+inline Word getValue<Word>(const jsi::Value &val, jsi::Runtime &runtime) {
+  jsi::Object obj = val.asObject(runtime);
+
+  // 1. Extract the string "word" using the existing string helper
+  std::string content =
+      getValue<std::string>(obj.getProperty(runtime, "word"), runtime);
+
+  // 2. Extract start/end times
+  // We use .asNumber() directly as these are primitives
+  double start = obj.getProperty(runtime, "start").asNumber();
+  double end = obj.getProperty(runtime, "end").asNumber();
+
+  // 3. Construct and return the C++ Word struct
+  return Word{
+      .content = std::move(content),
+      .start = static_cast<float>(start),
+      .end = static_cast<float>(end)
+  };
+}
 
 template <>
 inline JSTensorViewIn getValue(const jsi::Value &val,
                                jsi::Runtime &runtime) {
@@ -311,6 +319,23 @@ inline jsi::Value getJsiValue(std::shared_ptr<jsi::Value> valuePtr,
   return std::move(*valuePtr);
 }
 
+inline jsi::Value getJsiValue(const Word &word, jsi::Runtime &runtime) {
+  jsi::Object obj(runtime);
+  obj.setProperty(runtime, "word",
+                  jsi::String::createFromUtf8(runtime, word.content));
+  obj.setProperty(runtime, "start", static_cast<double>(word.start));
+  obj.setProperty(runtime, "end", static_cast<double>(word.end));
+  return obj;
+}
+
+inline jsi::Value getJsiValue(const std::vector<Word> &vec,
+                              jsi::Runtime &runtime) {
+  jsi::Array array(runtime, vec.size());
+  for (size_t i = 0; i < vec.size(); ++i) {
+    // Convert each Word using the helper above and place in array
+    array.setValueAtIndex(runtime, i, getJsiValue(vec[i], runtime));
+  }
+  return {runtime, array};
+}
+
 inline jsi::Value getJsiValue(const std::vector &vec,
                               jsi::Runtime &runtime) {
   jsi::Array array(runtime, vec.size());
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
index 815964aed..38210d4cb 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
@@ -26,6 +26,8 @@
 #include
 #include
 
+using rnexecutorch::models::speech_to_text::types::Word;
+
 namespace rnexecutorch {
 
 template <typename Model> class ModelHostObject : public JsiHostObject {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
index f026f30a9..a6ec82795 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
@@ -43,28 +43,28 @@ SpeechToText::decode(std::span tokens,
   return std::make_shared(decoderOutput);
 }
 
-std::vector<char> SpeechToText::transcribe(std::span<float> waveform,
-                                           std::string languageOption) const {
-  std::vector<Segment> segments =
-      this->asr->transcribe(waveform, DecodingOptions(languageOption));
-  std::string transcription;
-
-  size_t transcriptionLength = 0;
-  for (auto &segment : segments) {
-    for (auto &word : segment.words) {
-      transcriptionLength += word.content.size();
-    }
-  }
-  transcription.reserve(transcriptionLength);
+// std::vector<char> SpeechToText::transcribe(std::span<float> waveform,
+//                                            std::string languageOption) const {
+//   std::vector<Segment> segments =
+//       this->asr->transcribe(waveform, DecodingOptions(languageOption));
+//   std::string transcription;
+
+//   size_t transcriptionLength = 0;
+//   for (auto &segment : segments) {
+//     for (auto &word : segment.words) {
+//       transcriptionLength += word.content.size();
+//     }
+//   }
+//   transcription.reserve(transcriptionLength);
 
-  for (auto &segment : segments) {
-    for (auto &word : segment.words) {
-      transcription += word.content;
-    }
-  }
+//   for (auto &segment : segments) {
+//     for (auto &word : segment.words) {
+//       transcription += word.content;
+//     }
+//   }
 
-  return {transcription.begin(), transcription.end()};
-}
+//   return {transcription.begin(), transcription.end()};
+// }
 
 std::vector<Word> SpeechToText::transcribe(std::span<float> waveform,
                                            std::string languageOption) const {
diff --git
a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp
index b8a7aced4..f62986b72 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp
@@ -35,7 +35,11 @@ ProcessResult OnlineASRProcessor::processIter(const DecodingOptions &options) {
   }
 
   std::deque<Word> nonCommittedWords = this->hypothesisBuffer.complete();
-  return {this->toFlush(flushed), this->toFlush(nonCommittedWords)};
+  // return {this->toFlush(flushed), this->toFlush(nonCommittedWords)};
+  return {std::vector<Word>(std::make_move_iterator(flushed.begin()),
+                            std::make_move_iterator(flushed.end())),
+          std::vector<Word>(std::make_move_iterator(nonCommittedWords.begin()),
+                            std::make_move_iterator(nonCommittedWords.end()))};
 }
 
 void OnlineASRProcessor::chunkCompletedSegment(std::span<Segment> res) {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h
index 0cb05e5a6..685ba2b76 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h
@@ -4,9 +4,14 @@
 
 namespace rnexecutorch::models::speech_to_text::types {
 
+// struct ProcessResult {
+//   std::string committed;
+//   std::string nonCommitted;
+// };
+
 struct ProcessResult {
-  std::string committed;
-  std::string nonCommitted;
+  std::vector<Word> committed;
+  std::vector<Word> nonCommitted;
 };
 
 } // namespace rnexecutorch::models::speech_to_text::types

From 0dcff40c491f9b506d130e6587e9e414ad067b00 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?=
Date: Wed, 21 Jan 2026 15:36:31 +0100
Subject: [PATCH 04/49] Working version of both timestamping and regular
 version

---
 apps/speech/screens/SpeechToTextScreen.tsx    | 378 ++++--------------
 .../host_objects/ModelHostObject.h            |   5 +
 .../models/speech_to_text/SpeechToText.cpp    | 109 +++--
 .../models/speech_to_text/SpeechToText.h      |   6 +-
 .../useSpeechToText.ts                        | 223 +++--------
 .../SpeechToTextModule.ts                     |  62 ++-
 .../react-native-executorch/src/types/stt.ts  |   1 +
 7 files changed, 243 insertions(+), 541 deletions(-)

diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx
index 542d5dd01..1e12163f4 100644
--- a/apps/speech/screens/SpeechToTextScreen.tsx
+++ b/apps/speech/screens/SpeechToTextScreen.tsx
@@ -1,300 +1,3 @@
-// import React, { useEffect, useRef, useState } from 'react';
-// import {
-//   Text,
-//   View,
-//   StyleSheet,
-//   TouchableOpacity,
-//   ScrollView,
-//   TextInput,
-//   KeyboardAvoidingView,
-//   Platform,
-// } from 'react-native';
-// import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context';
-// import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch';
-// import FontAwesome from '@expo/vector-icons/FontAwesome';
-// import {
-//   AudioManager,
-//   AudioRecorder,
-//   AudioContext,
-// } from 'react-native-audio-api';
-// import * as FileSystem from 'expo-file-system/legacy';
-// import SWMIcon from '../assets/swm_icon.svg';
-// import DeviceInfo from 'react-native-device-info';
-
-// const isSimulator = DeviceInfo.isEmulatorSync();
-
-// export
const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { -// const model = useSpeechToText({ -// model: WHISPER_TINY_EN, -// }); - -// const [transcription, setTranscription] = useState(''); -// const [audioURL, setAudioURL] = useState(''); -// const [liveTranscribing, setLiveTranscribing] = useState(false); -// const scrollViewRef = useRef(null); - -// const [recorder] = useState( -// () => -// new AudioRecorder({ -// sampleRate: 16000, -// bufferLengthInSamples: 1600, -// }) -// ); - -// useEffect(() => { -// AudioManager.setAudioSessionOptions({ -// iosCategory: 'playAndRecord', -// iosMode: 'spokenAudio', -// iosOptions: ['allowBluetooth', 'defaultToSpeaker'], -// }); -// AudioManager.requestRecordingPermissions(); -// }, []); - -// const handleTranscribeFromURL = async () => { -// if (!audioURL.trim()) { -// console.warn('Please provide a valid audio file URL'); -// return; -// } - -// const { uri } = await FileSystem.downloadAsync( -// audioURL, -// FileSystem.cacheDirectory + 'audio_file' -// ); - -// const audioContext = new AudioContext({ sampleRate: 16000 }); - -// try { -// const decodedAudioData = await audioContext.decodeAudioDataSource(uri); -// const audioBuffer = decodedAudioData.getChannelData(0); -// setTranscription(await model.transcribe(audioBuffer)); -// } catch (error) { -// console.error('Error decoding audio data', error); -// console.warn('Note: Supported file formats: mp3, wav, flac'); -// return; -// } -// }; - -// const handleStartTranscribeFromMicrophone = async () => { -// setLiveTranscribing(true); -// setTranscription(''); -// recorder.onAudioReady(({ buffer }) => { -// model.streamInsert(buffer.getChannelData(0)); -// }); -// recorder.start(); - -// try { -// await model.stream(); -// } catch (error) { -// console.error('Error during live transcription:', error); -// } -// }; - -// const handleStopTranscribeFromMicrophone = () => { -// recorder.stop(); -// model.streamStop(); -// console.log('Live transcription stopped'); -// setLiveTranscribing(false); -// }; - -// const getModelStatus = () => { -// if (model.error) return `${model.error}`; -// if (model.isGenerating) return 'Transcribing...'; -// if (model.isReady) return 'Ready to transcribe'; -// return `Loading model: ${(100 * model.downloadProgress).toFixed(2)}%`; -// }; - -// const readyToTranscribe = !model.isGenerating && model.isReady; -// const recordingButtonDisabled = isSimulator || !readyToTranscribe; - -// return ( -// -// -// -// -// -// -// -// -// React Native ExecuTorch -// Speech to Text -// - -// -// Status: {getModelStatus()} -// - -// -// Transcription -// -// scrollViewRef.current?.scrollToEnd({ animated: true }) -// } -// > -// -// {transcription !== '' -// ? transcription -// : model.committedTranscription + -// model.nonCommittedTranscription} -// -// -// - -// -// -// -// -// Start -// -// - -// {liveTranscribing ? ( -// -// -// Stop Live Transcription -// -// ) : ( -// -// -// -// {isSimulator -// ? 
'Recording is not available on Simulator' -// : 'Start Live Transcription'} -// -// -// )} -// -// -// -// -// ); -// }; - -// const styles = StyleSheet.create({ -// container: { -// flex: 1, -// alignItems: 'center', -// backgroundColor: 'white', -// paddingHorizontal: 16, -// }, -// keyboardAvoidingView: { -// flex: 1, -// width: '100%', -// }, -// header: { -// alignItems: 'center', -// position: 'relative', -// width: '100%', -// }, -// backButton: { -// position: 'absolute', -// left: 0, -// top: 10, -// padding: 10, -// zIndex: 1, -// }, -// headerText: { -// fontSize: 22, -// fontWeight: 'bold', -// color: '#0f186e', -// }, -// statusContainer: { -// marginTop: 12, -// alignItems: 'center', -// }, -// transcriptionContainer: { -// flex: 1, -// width: '100%', -// marginVertical: 12, -// }, -// transcriptionLabel: { -// marginLeft: 12, -// marginBottom: 4, -// color: '#0f186e', -// }, -// transcriptionScrollContainer: { -// borderRadius: 12, -// borderWidth: 1, -// borderColor: '#0f186e', -// padding: 12, -// }, -// inputContainer: { -// marginBottom: 12, -// }, -// urlTranscriptionContainer: { -// width: '100%', -// flexDirection: 'row', -// }, -// urlTranscriptionInput: { -// flex: 1, -// padding: 12, -// borderTopLeftRadius: 12, -// borderBottomLeftRadius: 12, -// borderWidth: 1, -// borderColor: '#0f186e', -// borderRightWidth: 0, -// }, -// urlTranscriptionButton: { -// backgroundColor: '#0f186e', -// justifyContent: 'center', -// alignItems: 'center', -// padding: 12, -// borderTopRightRadius: 12, -// borderBottomRightRadius: 12, -// }, -// buttonText: { -// color: 'white', -// fontWeight: '600', -// letterSpacing: -0.5, -// fontSize: 16, -// }, -// liveTranscriptionButton: { -// flexDirection: 'row', -// justifyContent: 'center', -// alignItems: 'center', -// padding: 12, -// borderRadius: 12, -// marginTop: 12, -// gap: 8, -// }, -// backgroundRed: { -// backgroundColor: 'red', -// }, -// backgroundBlue: { -// backgroundColor: '#0f186e', -// }, -// disabled: { -// opacity: 0.5, -// }, -// }); - import React, { useEffect, useRef, useState } from 'react'; import { Text, @@ -305,12 +8,12 @@ import { TextInput, KeyboardAvoidingView, Platform, + Switch, // Import Switch } from 'react-native'; import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; import { useSpeechToText, WHISPER_TINY_EN, - // Make sure Word is exported from your module Word, } from 'react-native-executorch'; import FontAwesome from '@expo/vector-icons/FontAwesome'; @@ -330,8 +33,11 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { model: WHISPER_TINY_EN, }); - // CHANGE 1: Update state to hold Word[] instead of string - const [transcription, setTranscription] = useState([]); + // CHANGE 1: State can now be string OR Word[] + const [transcription, setTranscription] = useState(''); + + // CHANGE 2: Add toggle for timestamps + const [enableTimestamps, setEnableTimestamps] = useState(false); const [audioURL, setAudioURL] = useState(''); const [liveTranscribing, setLiveTranscribing] = useState(false); @@ -370,6 +76,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { } const handleTranscribeFromURL = async () => { + console.log('[1] UI: Button Pressed. 
Calling model.stream()...'); if (!audioURL.trim()) { console.warn('Please provide a valid audio file URL'); return; @@ -382,8 +89,20 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { try { const decodedAudioData = await audioContext.decodeAudioDataSource(uri); const audioBuffer = decodedAudioData.getChannelData(0); - // model.transcribe now returns Word[], which matches our state type - setTranscription(await model.transcribe(audioBuffer)); + + // CHANGE 4: Pass the toggle flag to transcribe + // TypeScript will infer the return type based on the flag + if (enableTimestamps) { + const result = await model.transcribe(audioBuffer, { + enableTimestamps: true, + }); + setTranscription(result); + } else { + const result = await model.transcribe(audioBuffer, { + enableTimestamps: false, + }); + setTranscription(result); + } } catch (error) { console.error('Error decoding audio data', error); console.warn('Note: Supported file formats: mp3, wav, flac'); @@ -393,14 +112,21 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const handleStartTranscribeFromMicrophone = async () => { setLiveTranscribing(true); - setTranscription([]); // Reset to empty array + // Reset based on mode + setTranscription(enableTimestamps ? [] : ''); + recorder.onAudioReady(({ buffer }) => { model.streamInsert(buffer.getChannelData(0)); }); recorder.start(); try { - await model.stream(); + // CHANGE 5: Pass the toggle flag to stream + if (enableTimestamps) { + await model.stream({ enableTimestamps: true }); + } else { + await model.stream({ enableTimestamps: false }); + } } catch (error) { console.error('Error during live transcription:', error); } @@ -423,12 +149,16 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const readyToTranscribe = !model.isGenerating && model.isReady; const recordingButtonDisabled = isSimulator || !readyToTranscribe; - // CHANGE 3: Prepare the text for rendering - const displayedText = - transcription.length > 0 - ? getText(transcription) - : getText(model.committedTranscription) + - getText(model.nonCommittedTranscription); + // CHANGE 6: Logic to choose what text to display + // We use getText() on everything so it converts Arrays to Strings before concatenation + const hasResult = Array.isArray(transcription) + ? transcription.length > 0 + : transcription.length > 0; + + const displayedText = hasResult + ? getText(transcription) + : getText(model.committedTranscription) + + getText(model.nonCommittedTranscription); return ( @@ -450,6 +180,21 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { Status: {getModelStatus()} + {/* CHANGE 7: Add UI for the Toggle */} + + Enable Timestamps + { + setEnableTimestamps(val); + setTranscription(val ? [] : ''); // Reset transcription on toggle + }} + trackColor={{ false: '#767577', true: '#0f186e' }} + thumbColor={enableTimestamps ? '#fff' : '#f4f3f4'} + disabled={model.isGenerating} // Disable changing mode while running + /> + + Transcription void }) => { }; const styles = StyleSheet.create({ + // ... existing styles ... 
  container: {
     flex: 1,
     alignItems: 'center',
     backgroundColor: 'white',
     paddingHorizontal: 16,
   },
@@ -548,6 +294,18 @@ const styles = StyleSheet.create({
     marginTop: 12,
     alignItems: 'center',
   },
+  // New style for the toggle
+  toggleContainer: {
+    flexDirection: 'row',
+    alignItems: 'center',
+    marginTop: 10,
+    marginBottom: 5,
+  },
+  toggleLabel: {
+    fontSize: 16,
+    marginRight: 10,
+    color: '#0f186e',
+  },
   transcriptionContainer: {
     flex: 1,
     width: '100%',
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
index 38210d4cb..eeebd4b97 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
@@ -75,6 +75,11 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
                               promiseHostFunction<&Model::transcribe>,
                               "transcribe"));
 
+    addFunctions(
+        JSI_EXPORT_FUNCTION(ModelHostObject,
+                            promiseHostFunction<&Model::transcribeStringOnly>,
+                            "transcribeStringOnly"));
+
     addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject,
                                      promiseHostFunction<&Model::stream>,
                                      "stream"));
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
index a6ec82795..ad937b56c 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
@@ -44,7 +44,8 @@ SpeechToText::decode(std::span tokens,
 }
 
 // std::vector<char> SpeechToText::transcribe(std::span<float> waveform,
-//                                            std::string languageOption) const {
+//                                            std::string languageOption) const
+// {
 //   std::vector<Segment> segments =
 //       this->asr->transcribe(waveform, DecodingOptions(languageOption));
 //   std::string transcription;
@@ -85,21 +86,47 @@ std::vector<Word> SpeechToText::transcribe(std::span<float> waveform,
     }
   }
 
-  auto wordsToJsi = [](jsi::Runtime &rt,
-                       const std::vector<Word> &words) -> jsi::Value {
-    jsi::Array jsiArr(rt, words.size());
-    for (size_t i = 0; i < words.size(); ++i) {
-      jsi::Object obj(rt);
-      obj.setProperty(rt, "word",
-                      jsi::String::createFromUtf8(rt, words[i].content));
-      obj.setProperty(rt, "start", static_cast<double>(words[i].start));
-      obj.setProperty(rt, "end", static_cast<double>(words[i].end));
-      jsiArr.setValueAtIndex(rt, i, obj);
-    }
-    return jsiArr;
-  };
-
   return transcription;
 }
 
+std::vector<char>
+SpeechToText::transcribeStringOnly(std::span<float> waveform,
+                                   std::string languageOption) const {
+  std::vector<Segment> segments =
+      this->asr->transcribe(waveform, DecodingOptions(languageOption));
+  std::string transcription;
+
+  size_t transcriptionLength = 0;
+  for (auto &segment : segments) {
+    for (auto &word : segment.words) {
+      transcriptionLength += word.content.size();
+    }
+  }
+  transcription.reserve(transcriptionLength);
+
+  for (auto &segment : segments) {
+    for (auto &word : segment.words) {
+      transcription += word.content;
+    }
+  }
+
+  return {transcription.begin(), transcription.end()};
+}
+
+std::vector<char> mergeWordsToString(const std::vector<Word> &words) {
+  std::string result;
+  size_t totalLength = 0;
+
+  for (const auto &word : words) {
+    totalLength += word.content.size();
+  }
+  result.reserve(totalLength);
+
+  for (const auto &word : words) {
+    result += word.content;
+  }
+
+  return {result.begin(), result.end()};
+}
 
 size_t
SpeechToText::getMemoryLowerBound() const noexcept {
   return this->encoder->getMemoryLowerBound() +
          this->decoder->getMemoryLowerBound();
 }
 
 void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
-                          std::string languageOption) {
+                          std::string languageOption, bool enableTimestamps) {
   if (this->isStreaming) {
     throw RnExecutorchError(RnExecutorchErrorCode::StreamingInProgress,
                             "Streaming is already in progress!");
   }
 
-  auto wordsToJsi = [](jsi::Runtime &rt,
-                       const std::vector<Word> &words) -> jsi::Value {
-    jsi::Array jsiArr(rt, words.size());
-    for (size_t i = 0; i < words.size(); ++i) {
-      jsi::Object obj(rt);
-      obj.setProperty(rt, "word",
-                      jsi::String::createFromUtf8(rt, words[i].content));
-      obj.setProperty(rt, "start", static_cast<double>(words[i].start));
-      obj.setProperty(rt, "end", static_cast<double>(words[i].end));
-      jsiArr.setValueAtIndex(rt, i, obj);
-    }
-    return jsiArr;
-  };
-
-  auto nativeCallback = [this, callback,
-                         wordsToJsi](const std::vector<Word> &committedVec,
-                                     const std::vector<Word> &nonCommittedVec,
-                                     bool isDone) {
-    this->callInvoker->invokeAsync([callback, committedVec, nonCommittedVec,
-                                    isDone, wordsToJsi](jsi::Runtime &rt) {
-      jsi::Value committedJsi = wordsToJsi(rt, committedVec);
-      jsi::Value nonCommittedJsi = wordsToJsi(rt, nonCommittedVec);
-
-      callback->call(rt, std::move(committedJsi), std::move(nonCommittedJsi),
-                     jsi::Value(isDone));
-    });
-  };
+  auto nativeCallback = [this, callback](const auto &committedVec,
+                                         const auto &nonCommittedVec,
+                                         bool isDone) {
+    this->callInvoker->invokeAsync(
+        [callback, committedVec, nonCommittedVec, isDone](jsi::Runtime &rt) {
+          jsi::Value committedJsi =
+              rnexecutorch::jsi_conversion::getJsiValue(committedVec, rt);
+          jsi::Value nonCommittedJsi =
+              rnexecutorch::jsi_conversion::getJsiValue(nonCommittedVec, rt);
+
+          callback->call(rt, std::move(committedJsi),
+                         std::move(nonCommittedJsi), jsi::Value(isDone));
+        });
+  };
 
   this->isStreaming = true;
   while (this->isStreaming) {
     if (!this->readyToProcess ||
         this->processor->audioBuffer.size() < SpeechToText::kMinAudioSamples) {
       std::this_thread::sleep_for(std::chrono::milliseconds(100));
       continue;
     }
     ProcessResult res =
         this->processor->processIter(DecodingOptions(languageOption));
 
-    nativeCallback(res.committed, res.nonCommitted, false);
+    if (enableTimestamps) {
+      nativeCallback(res.committed, res.nonCommitted, false);
+    } else {
+      nativeCallback(mergeWordsToString(res.committed),
+                     mergeWordsToString(res.nonCommitted), false);
+    }
     this->readyToProcess = false;
   }
 
   // finish() now returns std::vector<Word>
   std::vector<Word> committed = this->processor->finish();
 
-  nativeCallback(committed, {}, true);
+  if (enableTimestamps) {
+    nativeCallback(committed, std::vector<Word>{}, true);
+  } else {
+    nativeCallback(mergeWordsToString(committed), std::vector<char>(), true);
+  }
 
   this->resetStreamState();
 }
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
index e206f6ca7..8b525cc2d 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
@@ -26,11 +26,15 @@ class SpeechToText {
   [[nodiscard("Registered non-void function")]] std::vector<Word>
   transcribe(std::span<float> waveform, std::string languageOption) const;
 
+  [[nodiscard("Registered non-void function")]]
+  std::vector<char> transcribeStringOnly(std::span<float> waveform,
+                                         std::string languageOption) const;
+
   size_t getMemoryLowerBound() const noexcept;
 
   // Stream
   void stream(std::shared_ptr<jsi::Function> callback,
-              std::string languageOption);
+              std::string languageOption, bool enableTimestamps);
   void streamStop();
   void streamInsert(std::span<float> waveform);
diff
--git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
index 8285c918d..f9b5da8b1 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
@@ -1,141 +1,3 @@
-// import { useEffect, useCallback, useState } from 'react';
-// import { SpeechToTextModule, Word } from '../../modules/natural_language_processing/SpeechToTextModule';
-// import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt';
-// import { RnExecutorchErrorCode } from '../../errors/ErrorCodes';
-// import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils';
-
-// export const useSpeechToText = ({
-//   model,
-//   preventLoad = false,
-// }: {
-//   model: SpeechToTextModelConfig;
-//   preventLoad?: boolean;
-// }) => {
-//   const [error, setError] = useState<RnExecutorchError | null>(null);
-//   const [isReady, setIsReady] = useState(false);
-//   const [isGenerating, setIsGenerating] = useState(false);
-//   const [downloadProgress, setDownloadProgress] = useState(0);
-
-//   const [modelInstance] = useState(() => new SpeechToTextModule());
-//   const [committedTranscription, setCommittedTranscription] = useState(Word);
-//   const [nonCommittedTranscription, setNonCommittedTranscription] =
-//     useState(Word);
-
-//   useEffect(() => {
-//     if (preventLoad) return;
-//     (async () => {
-//       setDownloadProgress(0);
-//       setError(null);
-//       try {
-//         setIsReady(false);
-//         await modelInstance.load(
-//           {
-//             isMultilingual: model.isMultilingual,
-//             encoderSource: model.encoderSource,
-//             decoderSource: model.decoderSource,
-//             tokenizerSource: model.tokenizerSource,
-//           },
-//           setDownloadProgress
-//         );
-//         setIsReady(true);
-//       } catch (err) {
-//         setError(parseUnknownError(err));
-//       }
-//     })();
-//   }, [
-//     modelInstance,
-//     model.isMultilingual,
-//     model.encoderSource,
-//     model.decoderSource,
-//     model.tokenizerSource,
-//     preventLoad,
-//   ]);
-
-//   const stateWrapper = useCallback(
-//     <T extends (...args: any) => Promise<any>>(fn: T) =>
-//       async (...args: Parameters<T>): Promise<Awaited<ReturnType<T>>> => {
-//         if (!isReady)
-//           throw new RnExecutorchError(
-//             RnExecutorchErrorCode.ModuleNotLoaded,
-//             'The model is currently not loaded. Please load the model before calling this function.'
-//           );
-//         if (isGenerating)
-//           throw new RnExecutorchError(
-//             RnExecutorchErrorCode.ModelGenerating,
-//             'The model is currently generating. Please wait until previous model run is complete.'
-//           );
-//         setIsGenerating(true);
-//         try {
-//           return await fn.apply(modelInstance, args);
-//         } finally {
-//           setIsGenerating(false);
-//         }
-//       },
-//     [isReady, isGenerating, modelInstance]
-//   );
-
-//   const stream = useCallback(
-//     async (options?: DecodingOptions) => {
-//       if (!isReady)
-//         throw new RnExecutorchError(
-//           RnExecutorchErrorCode.ModuleNotLoaded,
-//           'The model is currently not loaded. Please load the model before calling this function.'
-//         );
-//       if (isGenerating)
-//         throw new RnExecutorchError(
-//           RnExecutorchErrorCode.ModelGenerating,
-//           'The model is currently generating. Please wait until previous model run is complete.'
-// ); -// setIsGenerating(true); -// setCommittedTranscription(''); -// setNonCommittedTranscription(''); -// let transcription = ''; -// try { -// for await (const { committed, nonCommitted } of modelInstance.stream( -// options -// )) { -// setCommittedTranscription((prev) => prev + committed); -// setNonCommittedTranscription(nonCommitted); -// transcription += committed; -// } -// } finally { -// setIsGenerating(false); -// } -// return transcription; -// }, -// [isReady, isGenerating, modelInstance] -// ); - -// const wrapper = useCallback( -// any>(fn: T) => { -// return (...args: Parameters): ReturnType => { -// if (!isReady) -// throw new RnExecutorchError( -// RnExecutorchErrorCode.ModuleNotLoaded, -// 'The model is currently not loaded. Please load the model before calling this function.' -// ); -// return fn.apply(modelInstance, args); -// }; -// }, -// [isReady, modelInstance] -// ); - -// return { -// error, -// isReady, -// isGenerating, -// downloadProgress, -// committedTranscription, -// nonCommittedTranscription, -// encode: stateWrapper(SpeechToTextModule.prototype.encode), -// decode: stateWrapper(SpeechToTextModule.prototype.decode), -// transcribe: stateWrapper(SpeechToTextModule.prototype.transcribe), -// stream, -// streamStop: wrapper(SpeechToTextModule.prototype.streamStop), -// streamInsert: wrapper(SpeechToTextModule.prototype.streamInsert), -// }; -// }; - import { useEffect, useCallback, useState } from 'react'; import { SpeechToTextModule } from '../../modules/natural_language_processing/SpeechToTextModule'; import { @@ -164,13 +26,13 @@ export const useSpeechToText = ({ const [modelInstance] = useState(() => new SpeechToTextModule()); - // FIX 1: Initialize with empty array [], generic type Word[] - const [committedTranscription, setCommittedTranscription] = useState( - [] - ); + // FIX 1: Allow state to be either string or Word[] + const [committedTranscription, setCommittedTranscription] = useState< + string | Word[] + >(''); const [nonCommittedTranscription, setNonCommittedTranscription] = useState< - Word[] - >([]); + string | Word[] + >(''); useEffect(() => { if (preventLoad) return; @@ -193,14 +55,7 @@ export const useSpeechToText = ({ setError(parseUnknownError(err)); } })(); - }, [ - modelInstance, - model.isMultilingual, - model.encoderSource, - model.decoderSource, - model.tokenizerSource, - preventLoad, - ]); + }, [modelInstance, model, preventLoad]); const stateWrapper = useCallback( Promise>(fn: T) => @@ -208,12 +63,12 @@ export const useSpeechToText = ({ if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling this function.' + 'The model is currently not loaded.' ); if (isGenerating) throw new RnExecutorchError( RnExecutorchErrorCode.ModelGenerating, - 'The model is currently generating. Please wait until previous model run is complete.' + 'The model is currently generating.' ); setIsGenerating(true); try { @@ -226,38 +81,66 @@ export const useSpeechToText = ({ ); const stream = useCallback( - async (options?: DecodingOptions) => { + async (options?: DecodingOptions & { enableTimestamps?: boolean }) => { + console.log( + '[2] Hook: Stream called. Ready:', + isReady, + 'Generating:', + isGenerating + ); if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling this function.' 
+ 'Model not loaded' ); if (isGenerating) throw new RnExecutorchError( RnExecutorchErrorCode.ModelGenerating, - 'The model is currently generating. Please wait until previous model run is complete.' + 'Model is generating' ); + setIsGenerating(true); - // FIX 2: Reset to empty arrays - setCommittedTranscription([]); - setNonCommittedTranscription([]); + // FIX 2: Reset based on the mode requested + const enableTimestamps = options?.enableTimestamps ?? false; + setCommittedTranscription(enableTimestamps ? [] : ''); + setNonCommittedTranscription(enableTimestamps ? [] : ''); - // Accumulator is now an array of Words, not a string - const fullResult: Word[] = []; + let fullResult: string | Word[] = enableTimestamps ? [] : ''; try { + console.log('[3] Hook: Calling modelInstance.stream()'); + // @ts-ignore - Typescript struggles with the dual generator return type, but logic is safe for await (const { committed, nonCommitted } of modelInstance.stream( options )) { - // FIX 3: Update state by appending arrays - if (committed.length > 0) { - setCommittedTranscription((prev) => [...prev, ...committed]); - fullResult.push(...committed); + console.log(committed, nonCommitted); + // FIX 3: Dynamic Merging Logic + if (typeof committed === 'string') { + // --- STRING MODE --- + if (committed.length > 0) { + setCommittedTranscription((prev) => { + // Safety check: if prev was somehow an array, reset it or cast to string + const prevStr = typeof prev === 'string' ? prev : ''; + return prevStr + committed; + }); + (fullResult as string) += committed; + } + setNonCommittedTranscription(nonCommitted as string); + } else { + // --- WORD[] MODE --- + const committedWords = committed as Word[]; + const nonCommittedWords = nonCommitted as Word[]; + + if (committedWords.length > 0) { + setCommittedTranscription((prev) => { + const prevArr = Array.isArray(prev) ? prev : []; + return [...prevArr, ...committedWords]; + }); + (fullResult as Word[]).push(...committedWords); + } + setNonCommittedTranscription(nonCommittedWords); } - - // nonCommitted is always a fresh partial chunk - setNonCommittedTranscription(nonCommitted); } } finally { setIsGenerating(false); @@ -273,7 +156,7 @@ export const useSpeechToText = ({ if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling this function.' + 'Model not loaded' ); return fn.apply(modelInstance, args); }; diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts index 803e4146a..b52f49b4b 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts @@ -12,7 +12,10 @@ export class SpeechToTextModule { private nativeModule: any; private modelConfig!: SpeechToTextModelConfig; - // 2. TextDecoder is removed as C++ now returns JS objects directly + private textDecoder = new TextDecoder('utf-8', { + fatal: false, + ignoreBOM: true, + }); /** * Loads the model specified by the config object. 
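The hunks that follow rework `stream()` into an async generator fed by the native callback through a queue/waiter pair (`queue`, `waiter`, `finished`, `wake`). As a reading aid, here is that bridge pattern in isolation: a minimal, self-contained sketch. Every name in it is illustrative rather than part of the react-native-executorch API, and the `Word` shape is reproduced only so the snippet compiles.

```typescript
// Producer/consumer bridge: a push-style callback on one side, an async
// generator (`for await`) on the other. Illustrative sketch only.
type Word = { word: string; start: number; end: number };
type Chunk = { committed: string | Word[]; nonCommitted: string | Word[] };

function makeStreamChannel() {
  const queue: Chunk[] = [];
  let waiter: (() => void) | null = null;
  let finished = false;

  // Resolve the parked consumer, if there is one.
  const wake = () => {
    waiter?.();
    waiter = null;
  };

  return {
    // Producer side: called for every partial result.
    push(chunk: Chunk) {
      queue.push(chunk);
      wake();
    },
    // Producer side: called exactly once when the stream ends.
    close() {
      finished = true;
      wake();
    },
    // Consumer side: drains the queue, parking on a promise when empty.
    async *consume(): AsyncGenerator<Chunk> {
      while (!finished || queue.length > 0) {
        if (queue.length === 0) {
          await new Promise<void>((resolve) => (waiter = resolve));
          continue;
        }
        yield queue.shift()!;
      }
    },
  };
}
```

Both `push` and `close` must call `wake()`, or a consumer parked on the promise would never resume; the module's callback below keeps the same invariant by waking after every enqueue and once more when `isDone` is set.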
@@ -101,7 +104,7 @@ export class SpeechToTextModule { public async transcribe( waveform: Float32Array, options: DecodingOptions = {} - ): Promise { + ): Promise { this.validateOptions(options); const transcriptionBytes = await this.nativeModule.transcribe( waveform, @@ -125,10 +128,21 @@ export class SpeechToTextModule { */ public async *stream( options: DecodingOptions = {} - ): AsyncGenerator<{ committed: Word[]; nonCommitted: Word[] }> { + ): AsyncGenerator<{ + committed: string | Word[]; + nonCommitted: string | Word[]; + }> { + console.log('[4] Module: Entered stream method'); this.validateOptions(options); - const queue: { committed: Word[]; nonCommitted: Word[] }[] = []; + // Ensure we strictly default to false + const enableTimestamps = options.enableTimestamps === true; + + const queue: { + committed: string | Word[]; + nonCommitted: string | Word[]; + }[] = []; + let waiter: (() => void) | null = null; let finished = false; let error: unknown; @@ -140,20 +154,34 @@ export class SpeechToTextModule { (async () => { try { - await this.nativeModule.stream( - // Callback now receives arrays of objects directly - (committed: Word[], nonCommitted: Word[], isDone: boolean) => { - queue.push({ - committed, - nonCommitted, - }); - if (isDone) { - finished = true; + const callback = ( + committed: any, + nonCommitted: any, + isDone: boolean + ) => { + if (!enableTimestamps) { + try { + queue.push({ + committed: this.textDecoder.decode(new Uint8Array(committed)), + nonCommitted: this.textDecoder.decode( + new Uint8Array(nonCommitted) + ), + }); + } catch (err) { + console.error('[Stream Decode Error]', err); } - wake(); - }, - options.language || '' - ); + } else { + queue.push({ committed, nonCommitted }); + } + + if (isDone) finished = true; + wake(); + }; + + const language = options.language || ''; + + await this.nativeModule.stream(callback, language, enableTimestamps); + finished = true; wake(); } catch (e) { diff --git a/packages/react-native-executorch/src/types/stt.ts b/packages/react-native-executorch/src/types/stt.ts index ece8d6020..cfabb2313 100644 --- a/packages/react-native-executorch/src/types/stt.ts +++ b/packages/react-native-executorch/src/types/stt.ts @@ -195,6 +195,7 @@ export type SpeechToTextLanguage = */ export interface DecodingOptions { language?: SpeechToTextLanguage; + enableTimestamps?: boolean; } /** From 2d119d21dd7d950e5563ad6cc96bd132e3d73f5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 15:54:28 +0100 Subject: [PATCH 05/49] Clear files --- apps/speech/screens/SpeechToTextScreen.tsx | 23 ++---- .../host_objects/JsiConversions.h | 4 -- .../models/speech_to_text/SpeechToText.cpp | 70 ------------------- .../SpeechToTextModule.ts | 2 - 4 files changed, 5 insertions(+), 94 deletions(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index 1e12163f4..f513dbc06 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -8,7 +8,7 @@ import { TextInput, KeyboardAvoidingView, Platform, - Switch, // Import Switch + Switch, } from 'react-native'; import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; import { @@ -33,10 +33,8 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { model: WHISPER_TINY_EN, }); - // CHANGE 1: State can now be string OR Word[] const [transcription, setTranscription] = useState(''); - // CHANGE 2: Add toggle for timestamps const [enableTimestamps, 
setEnableTimestamps] = useState(false); const [audioURL, setAudioURL] = useState(''); @@ -76,7 +74,6 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { } const handleTranscribeFromURL = async () => { - console.log('[1] UI: Button Pressed. Calling model.stream()...'); if (!audioURL.trim()) { console.warn('Please provide a valid audio file URL'); return; @@ -94,12 +91,12 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { // TypeScript will infer the return type based on the flag if (enableTimestamps) { const result = await model.transcribe(audioBuffer, { - enableTimestamps: true, + enableTimestamps: true }); setTranscription(result); } else { const result = await model.transcribe(audioBuffer, { - enableTimestamps: false, + enableTimestamps: false }); setTranscription(result); } @@ -112,7 +109,6 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const handleStartTranscribeFromMicrophone = async () => { setLiveTranscribing(true); - // Reset based on mode setTranscription(enableTimestamps ? [] : ''); recorder.onAudioReady(({ buffer }) => { @@ -121,12 +117,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { recorder.start(); try { - // CHANGE 5: Pass the toggle flag to stream - if (enableTimestamps) { - await model.stream({ enableTimestamps: true }); - } else { - await model.stream({ enableTimestamps: false }); - } + await model.stream({ enableTimestamps: enableTimestamps }); } catch (error) { console.error('Error during live transcription:', error); } @@ -149,11 +140,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const readyToTranscribe = !model.isGenerating && model.isReady; const recordingButtonDisabled = isSimulator || !readyToTranscribe; - // CHANGE 6: Logic to choose what text to display - // We use getText() on everything so it converts Arrays to Strings before concatenation - const hasResult = Array.isArray(transcription) - ? transcription.length > 0 - : transcription.length > 0; + const hasResult = transcription.length > 0; const displayedText = hasResult ? getText(transcription) diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index 184b66a4f..9e2df2fc8 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -69,15 +69,11 @@ template <> inline Word getValue(const jsi::Value &val, jsi::Runtime &runtime) { jsi::Object obj = val.asObject(runtime); - // 1. Extract the string "word" using the existing string helper std::string content = getValue(obj.getProperty(runtime, "word"), runtime); - // 2. Extract start/end times - // We use .asNumber() directly as these are primitives double start = obj.getProperty(runtime, "start").asNumber(); double end = obj.getProperty(runtime, "end").asNumber(); - // 3. 
Construct and return the C++ Word struct return Word{ .content = std::move(content), .start = static_cast(start), diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp index ad937b56c..04b242454 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp @@ -43,30 +43,6 @@ SpeechToText::decode(std::span tokens, return std::make_shared(decoderOutput); } -// std::vector SpeechToText::transcribe(std::span waveform, -// std::string languageOption) const -// { -// std::vector segments = -// this->asr->transcribe(waveform, DecodingOptions(languageOption)); -// std::string transcription; - -// size_t transcriptionLength = 0; -// for (auto &segment : segments) { -// for (auto &word : segment.words) { -// transcriptionLength += word.content.size(); -// } -// } -// transcription.reserve(transcriptionLength); - -// for (auto &segment : segments) { -// for (auto &word : segment.words) { -// transcription += word.content; -// } -// } - -// return {transcription.begin(), transcription.end()}; -// } - std::vector SpeechToText::transcribe(std::span waveform, std::string languageOption) const { std::vector segments = @@ -134,51 +110,6 @@ size_t SpeechToText::getMemoryLowerBound() const noexcept { this->decoder->getMemoryLowerBound(); } -// void SpeechToText::stream(std::shared_ptr callback, -// std::string languageOption) { -// if (this->isStreaming) { -// throw RnExecutorchError(RnExecutorchErrorCode::StreamingInProgress, -// "Streaming is already in progress!"); -// } - -// auto nativeCallback = -// [this, callback](const std::vector &committedVec, -// const std::vector &nonCommittedVec, bool isDone) -// { -// this->callInvoker->invokeAsync([callback, committedVec, -// nonCommittedVec, -// isDone](jsi::Runtime &rt) { -// callback->call( -// rt, rnexecutorch::jsi_conversion::getJsiValue(committedVec, -// rt), rnexecutorch::jsi_conversion::getJsiValue(nonCommittedVec, -// rt), jsi::Value(isDone)); -// }); -// }; - -// this->isStreaming = true; -// while (this->isStreaming) { -// if (!this->readyToProcess || -// this->processor->audioBuffer.size() < SpeechToText::kMinAudioSamples) -// { -// std::this_thread::sleep_for(std::chrono::milliseconds(100)); -// continue; -// } -// ProcessResult res = -// this->processor->processIter(DecodingOptions(languageOption)); - -// nativeCallback({res.committed.begin(), res.committed.end()}, -// {res.nonCommitted.begin(), res.nonCommitted.end()}, -// false); -// this->readyToProcess = false; -// } - -// std::string committed = this->processor->finish(); - -// nativeCallback({committed.begin(), committed.end()}, {}, true); - -// this->resetStreamState(); -// } - void SpeechToText::stream(std::shared_ptr callback, std::string languageOption, bool enableTimestamps) { if (this->isStreaming) { @@ -220,7 +151,6 @@ void SpeechToText::stream(std::shared_ptr callback, this->readyToProcess = false; } - // finish() now returns std::vector std::vector committed = this->processor->finish(); if (enableTimestamps) { diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts index b52f49b4b..cc5669b27 100644 --- 
a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts @@ -132,10 +132,8 @@ export class SpeechToTextModule { committed: string | Word[]; nonCommitted: string | Word[]; }> { - console.log('[4] Module: Entered stream method'); this.validateOptions(options); - // Ensure we strictly default to false const enableTimestamps = options.enableTimestamps === true; const queue: { From 30b76cf5f59dd7cfa315517f2b7d1cb358d077b7 Mon Sep 17 00:00:00 2001 From: Mateusz Sluszniak <56299341+msluszniak@users.noreply.github.com> Date: Wed, 21 Jan 2026 16:36:46 +0100 Subject: [PATCH 06/49] Apply suggestions from code review --- apps/speech/screens/SpeechToTextScreen.tsx | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index f513dbc06..735e66d78 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -91,12 +91,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { // TypeScript will infer the return type based on the flag if (enableTimestamps) { const result = await model.transcribe(audioBuffer, { - enableTimestamps: true - }); - setTranscription(result); - } else { - const result = await model.transcribe(audioBuffer, { - enableTimestamps: false + enableTimestamps: enableTimestamps }); setTranscription(result); } @@ -167,18 +162,17 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { Status: {getModelStatus()} - {/* CHANGE 7: Add UI for the Toggle */} Enable Timestamps { setEnableTimestamps(val); - setTranscription(val ? [] : ''); // Reset transcription on toggle + setTranscription(val ? [] : ''); }} trackColor={{ false: '#767577', true: '#0f186e' }} thumbColor={enableTimestamps ? '#fff' : '#f4f3f4'} - disabled={model.isGenerating} // Disable changing mode while running + disabled={model.isGenerating} /> From 084cf1efced8710cbaa564753d05fb90f6916c99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 16:46:49 +0100 Subject: [PATCH 07/49] Apply further clearing --- apps/speech/screens/SpeechToTextScreen.tsx | 4 --- .../host_objects/JsiConversions.h | 1 - .../host_objects/ModelHostObject.h | 3 --- .../stream/OnlineASRProcessor.cpp | 25 ++++++------------- .../stream/OnlineASRProcessor.h | 3 --- .../speech_to_text/types/ProcessResult.h | 5 ---- .../useSpeechToText.ts | 14 ----------- 7 files changed, 7 insertions(+), 48 deletions(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index 735e66d78..5fba8d055 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -87,8 +87,6 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const decodedAudioData = await audioContext.decodeAudioDataSource(uri); const audioBuffer = decodedAudioData.getChannelData(0); - // CHANGE 4: Pass the toggle flag to transcribe - // TypeScript will infer the return type based on the flag if (enableTimestamps) { const result = await model.transcribe(audioBuffer, { enableTimestamps: enableTimestamps @@ -243,7 +241,6 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { }; const styles = StyleSheet.create({ - // ... existing styles ... 
container: { flex: 1, alignItems: 'center', @@ -275,7 +272,6 @@ const styles = StyleSheet.create({ marginTop: 12, alignItems: 'center', }, - // New style for the toggle toggleContainer: { flexDirection: 'row', alignItems: 'center', diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index 9e2df2fc8..d9d49b6b4 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -326,7 +326,6 @@ inline jsi::Value getJsiValue(const Word &word, jsi::Runtime &runtime) { inline jsi::Value getJsiValue(const std::vector &vec, jsi::Runtime &runtime) { jsi::Array array(runtime, vec.size()); for (size_t i = 0; i < vec.size(); ++i) { - // Convert each Word using the helper above and place in array array.setValueAtIndex(runtime, i, getJsiValue(vec[i], runtime)); } return {runtime, array}; diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index eeebd4b97..50797417e 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -20,14 +20,11 @@ #include #include #include -#include #include #include #include #include -using rnexecutorch::models::speech_to_text::types::Word; - namespace rnexecutorch { template class ModelHostObject : public JsiHostObject { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp index f62986b72..3137d274b 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp @@ -34,12 +34,14 @@ ProcessResult OnlineASRProcessor::processIter(const DecodingOptions &options) { chunkCompletedSegment(res); } + auto move_to_vector = [](auto& container) { + return std::vector(std::make_move_iterator(container.begin()), + std::make_move_iterator(container.end())); + }; + std::deque nonCommittedWords = this->hypothesisBuffer.complete(); - // return {this->toFlush(flushed), this->toFlush(nonCommittedWords)}; - return {std::vector(std::make_move_iterator(flushed.begin()), - std::make_move_iterator(flushed.end())), - std::vector(std::make_move_iterator(nonCommittedWords.begin()), - std::make_move_iterator(nonCommittedWords.end()))}; + + return { move_to_vector(flushed), move_to_vector(nonCommittedWords) }; } void OnlineASRProcessor::chunkCompletedSegment(std::span res) { @@ -86,22 +88,9 @@ std::vector OnlineASRProcessor::finish() { std::vector buffer(std::make_move_iterator(bufferDeq.begin()), std::make_move_iterator(bufferDeq.end())); - // std::string committedText = this->toFlush(buffer); this->bufferTimeOffset += static_cast(audioBuffer.size()) / OnlineASRProcessor::kSamplingRate; return buffer; } -// std::string OnlineASRProcessor::toFlush(const std::deque &words) const -// { -// std::string text; -// text.reserve(std::accumulate( -// words.cbegin(), words.cend(), 0, -// [](size_t sum, const Word &w) { return sum + w.content.size(); })); -// for (const auto &word 
: words) { -// text.append(word.content); -// } -// return text; -// } - } // namespace rnexecutorch::models::speech_to_text::stream diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h index 720e6bf76..3abaad3b6 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h @@ -12,7 +12,6 @@ class OnlineASRProcessor { void insertAudioChunk(std::span audio); types::ProcessResult processIter(const types::DecodingOptions &options); - // std::string finish(); std::vector finish(); std::vector audioBuffer; @@ -27,8 +26,6 @@ class OnlineASRProcessor { void chunkCompletedSegment(std::span res); void chunkAt(float time); - - // std::string toFlush(const std::deque &words) const; }; } // namespace rnexecutorch::models::speech_to_text::stream diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h index 685ba2b76..681495e2a 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h @@ -4,11 +4,6 @@ namespace rnexecutorch::models::speech_to_text::types { -// struct ProcessResult { -// std::string committed; -// std::string nonCommitted; -// }; - struct ProcessResult { std::vector committed; std::vector nonCommitted; diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index f9b5da8b1..f9af79a54 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -26,7 +26,6 @@ export const useSpeechToText = ({ const [modelInstance] = useState(() => new SpeechToTextModule()); - // FIX 1: Allow state to be either string or Word[] const [committedTranscription, setCommittedTranscription] = useState< string | Word[] >(''); @@ -82,12 +81,6 @@ export const useSpeechToText = ({ const stream = useCallback( async (options?: DecodingOptions & { enableTimestamps?: boolean }) => { - console.log( - '[2] Hook: Stream called. Ready:', - isReady, - 'Generating:', - isGenerating - ); if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, @@ -101,7 +94,6 @@ export const useSpeechToText = ({ setIsGenerating(true); - // FIX 2: Reset based on the mode requested const enableTimestamps = options?.enableTimestamps ?? false; setCommittedTranscription(enableTimestamps ? [] : ''); setNonCommittedTranscription(enableTimestamps ? [] : ''); @@ -109,18 +101,13 @@ export const useSpeechToText = ({ let fullResult: string | Word[] = enableTimestamps ? 
[] : ''; try { - console.log('[3] Hook: Calling modelInstance.stream()'); - // @ts-ignore - Typescript struggles with the dual generator return type, but logic is safe for await (const { committed, nonCommitted } of modelInstance.stream( options )) { console.log(committed, nonCommitted); - // FIX 3: Dynamic Merging Logic if (typeof committed === 'string') { - // --- STRING MODE --- if (committed.length > 0) { setCommittedTranscription((prev) => { - // Safety check: if prev was somehow an array, reset it or cast to string const prevStr = typeof prev === 'string' ? prev : ''; return prevStr + committed; }); @@ -128,7 +115,6 @@ export const useSpeechToText = ({ } setNonCommittedTranscription(nonCommitted as string); } else { - // --- WORD[] MODE --- const committedWords = committed as Word[]; const nonCommittedWords = nonCommitted as Word[]; From 24c4606729637f6804669bc869780f223ce0c52e Mon Sep 17 00:00:00 2001 From: Mateusz Sluszniak <56299341+msluszniak@users.noreply.github.com> Date: Wed, 21 Jan 2026 17:15:36 +0100 Subject: [PATCH 08/49] Apply suggestion from @msluszniak --- .../hooks/natural_language_processing/useSpeechToText.ts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index f9af79a54..17d05962d 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -54,7 +54,14 @@ export const useSpeechToText = ({ setError(parseUnknownError(err)); } })(); - }, [modelInstance, model, preventLoad]); + }, [ + modelInstance, + model.isMultilingual, + model.encoderSource, + model.decoderSource, + model.tokenizerSource, + preventLoad, + ]); const stateWrapper = useCallback( Promise>(fn: T) => From 8b019fe1df6fb17e7804fde690e1bface6048a49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 17:31:07 +0100 Subject: [PATCH 09/49] Apply autofix lint changes --- apps/speech/screens/SpeechToTextScreen.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index 5fba8d055..b1693968b 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -89,7 +89,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { if (enableTimestamps) { const result = await model.transcribe(audioBuffer, { - enableTimestamps: enableTimestamps + enableTimestamps: enableTimestamps, }); setTranscription(result); } From 5eab00da023019fb99c0959acd8c3b1c2287124c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 17:57:16 +0100 Subject: [PATCH 10/49] Fix linter issues --- apps/llm/app/voice_chat/index.tsx | 14 ++++++++++-- .../useSpeechToText.ts | 22 +++++++++++++------ .../SpeechToTextModule.ts | 2 +- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/apps/llm/app/voice_chat/index.tsx b/apps/llm/app/voice_chat/index.tsx index 79a713c93..0bf4c9b30 100644 --- a/apps/llm/app/voice_chat/index.tsx +++ b/apps/llm/app/voice_chat/index.tsx @@ -76,7 +76,11 @@ function VoiceChatScreen() { }); recorder.start(); const transcription = await speechToText.stream(); - await llm.sendMessage(transcription); + await llm.sendMessage( + typeof 
transcription === 'string' + ? transcription + : transcription.map((w) => w.word).join(' ') + ); } }; @@ -105,7 +109,13 @@ function VoiceChatScreen() { ...llm.messageHistory, { role: 'user', - content: speechToText.committedTranscription, + content: + typeof speechToText.committedTranscription === + 'string' + ? speechToText.committedTranscription + : speechToText.committedTranscription + .map((w) => w.word) + .join(' '), }, ] : llm.messageHistory diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index 17d05962d..083cdaf2d 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -108,11 +108,17 @@ export const useSpeechToText = ({ let fullResult: string | Word[] = enableTimestamps ? [] : ''; try { - for await (const { committed, nonCommitted } of modelInstance.stream( - options - )) { - console.log(committed, nonCommitted); + const streamGen = modelInstance.stream( + options as any + ) as AsyncGenerator<{ + committed: string | Word[]; + nonCommitted: string | Word[]; + }>; + + for await (const { committed, nonCommitted } of streamGen) { if (typeof committed === 'string') { + const nc = nonCommitted as unknown as string; + if (committed.length > 0) { setCommittedTranscription((prev) => { const prevStr = typeof prev === 'string' ? prev : ''; @@ -120,12 +126,12 @@ export const useSpeechToText = ({ }); (fullResult as string) += committed; } - setNonCommittedTranscription(nonCommitted as string); + setNonCommittedTranscription(nc); } else { const committedWords = committed as Word[]; const nonCommittedWords = nonCommitted as Word[]; - if (committedWords.length > 0) { + if (committedWords && committedWords.length > 0) { setCommittedTranscription((prev) => { const prevArr = Array.isArray(prev) ? 
prev : []; return [...prevArr, ...committedWords]; @@ -166,7 +172,9 @@ export const useSpeechToText = ({ nonCommittedTranscription, encode: stateWrapper(SpeechToTextModule.prototype.encode), decode: stateWrapper(SpeechToTextModule.prototype.decode), - transcribe: stateWrapper(SpeechToTextModule.prototype.transcribe), + transcribe: stateWrapper( + SpeechToTextModule.prototype.transcribe + ) as SpeechToTextModule['transcribe'], stream, streamStop: wrapper(SpeechToTextModule.prototype.streamStop), streamInsert: wrapper(SpeechToTextModule.prototype.streamInsert), diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts index cc5669b27..5891e4cd5 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts @@ -166,7 +166,7 @@ export class SpeechToTextModule { ), }); } catch (err) { - console.error('[Stream Decode Error]', err); + Logger.error('[Stream Decode Error]', err); } } else { queue.push({ committed, nonCommitted }); From db68c22089974f663128f05188053501084361ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 18:40:42 +0100 Subject: [PATCH 11/49] Revert changing error messages --- .../hooks/natural_language_processing/useSpeechToText.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index 083cdaf2d..9df9a88b1 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -69,12 +69,12 @@ export const useSpeechToText = ({ if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded.' + 'The model is currently not loaded. Please load the model before calling this function.' ); if (isGenerating) throw new RnExecutorchError( RnExecutorchErrorCode.ModelGenerating, - 'The model is currently generating.' + 'The model is currently generating. Please wait until previous model run is complete.' ); setIsGenerating(true); try { @@ -91,12 +91,12 @@ export const useSpeechToText = ({ if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'Model not loaded' + 'The model is currently not loaded. Please load the model before calling this function.' ); if (isGenerating) throw new RnExecutorchError( RnExecutorchErrorCode.ModelGenerating, - 'Model is generating' + 'The model is currently generating. Please wait until previous model run is complete.' 
          );

From 2a69753ec44bcf68d1a97f9fc6ebce37dc7bfd70 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?=
Date: Wed, 21 Jan 2026 18:42:25 +0100
Subject: [PATCH 12/49] Revert one more message

---
 .../src/hooks/natural_language_processing/useSpeechToText.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
index 9df9a88b1..611ec3153 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
@@ -155,7 +155,7 @@ export const useSpeechToText = ({
         if (!isReady)
           throw new RnExecutorchError(
             RnExecutorchErrorCode.ModuleNotLoaded,
-            'Model not loaded'
+            'The model is currently not loaded. Please load the model before calling this function.'
           );
         return fn.apply(modelInstance, args);
       };

From 11e01e837c00808420d02a6fdc5d67dcb2130308 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?=
Date: Wed, 21 Jan 2026 19:42:44 +0100
Subject: [PATCH 13/49] Update docs

---
 .../useSpeechToText.md    | 71 +++++++++++++++----
 .../SpeechToTextModule.md | 27 ++++++-
 2 files changed, 82 insertions(+), 16 deletions(-)

diff --git a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md
index 5b0545cf2..85c049f9c 100644
--- a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md
+++ b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md
@@ -83,6 +83,7 @@ Please note, that both [`transcribe`](../../06-api-reference/interfaces/SpeechTo
 
 To get more details please read: [`SpeechToTextType` API Reference](../../06-api-reference/interfaces/SpeechToTextType.md).
 
+
 ## Running the model
 
 Before running the model's [`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe) method, make sure to extract the audio waveform you want to transcribe. You'll need to handle this step yourself, ensuring the audio is sampled at 16 kHz. Once you have the waveform, pass it as an argument to the transcribe method. The method returns a promise that resolves to the generated transcription on success, or an error if inference fails.
@@ -101,12 +102,25 @@ const model = useSpeechToText({
 const transcription = await model.transcribe(spanishAudio, { language: 'es' });
 ```
 
+### Timestamps
+
+You can obtain word-level timestamps by setting `enableTimestamps: true` in the options. This changes the return type from a string to an array of `Word` objects.
+
+```typescript
+const words = await model.transcribe(audioBuffer, { enableTimestamps: true });
+// words: [{ word: "Hello", start: 0.0, end: 0.4 }, ...]
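+// If you also need plain text, the words can be joined back together, as
+// the voice chat example in this repo does (shown here for illustration):
+// const text = words.map((w) => w.word).join(' ');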
+```

## Example

```tsx
import React, { useState } from 'react';
-import { Button, Text } from 'react-native';
-import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch';
+import { Button, Text, View } from 'react-native';
+import {
+  useSpeechToText,
+  WHISPER_TINY_EN,
+  Word,
+} from 'react-native-executorch';
 import { AudioContext } from 'react-native-audio-api';
 import * as FileSystem from 'expo-file-system';
 
 function App() {
   const model = useSpeechToText({
     model: WHISPER_TINY_EN,
   });
 
-  const [transcription, setTranscription] = useState('');
+  const [transcription, setTranscription] = useState<string | Word[]>('');
 
   const loadAudio = async () => {
     const { uri } = await FileSystem.downloadAsync(
       'https://some-audio-url.com/file.mp3',
       FileSystem.cacheDirectory + 'audio_file'
     );
     const audioContext = new AudioContext({ sampleRate: 16000 });
     const decodedAudioData = await audioContext.decodeAudioDataSource(uri);
     return decodedAudioData.getChannelData(0);
   };
 
   const handleTranscribe = async () => {
     const audio = await loadAudio();
-    await model.transcribe(audio);
+    // Default text transcription
+    const result = await model.transcribe(audio);
+    setTranscription(result);
+  };
+
+  const handleTranscribeWithTimestamps = async () => {
+    const audio = await loadAudio();
+    // Transcription with timestamps
+    const result = await model.transcribe(audio, { enableTimestamps: true });
+    setTranscription(result);
+  };
+
+  const renderContent = () => {
+    if (typeof transcription === 'string') {
+      return <Text>{transcription}</Text>;
+    }
+    return transcription.map((w, i) => (
+      <Text key={i}>
+        {w.word} ({w.start.toFixed(2)}s)
+      </Text>
+    ));
   };
 
   return (
     <>
       {transcription}