From 89029cb02d3f0371d5dbc8939e43450e2183c314 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?=
Date: Tue, 20 Jan 2026 14:16:54 +0100
Subject: [PATCH 01/49] Draft of changes introducing timestamping

---
 .../host_objects/JsiConversions.h             |  14 ++
 .../models/speech_to_text/SpeechToText.cpp    | 127 +++++++++++--
 .../stream/OnlineASRProcessor.cpp             |  32 ++--
 .../stream/OnlineASRProcessor.h               |   5 +-
 .../useSpeechToText.ts                        | 171 +++++++++++++++++-
 .../SpeechToTextModule.ts                     |  24 +--
 6 files changed, 319 insertions(+), 54 deletions(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
index 2baf922db..570ba0939 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
@@ -62,6 +62,20 @@ getValue<std::shared_ptr<jsi::Function>>(const jsi::Value &val,
       val.asObject(runtime).asFunction(runtime));
 }
 
+// Serializes a vector of timestamped words into a JSI array of objects.
+inline jsi::Value getJsiValue(const std::vector<Word> &words,
+                              jsi::Runtime &rt) {
+  jsi::Array jsiArr(rt, words.size());
+  for (size_t i = 0; i < words.size(); ++i) {
+    jsi::Object obj(rt);
+    obj.setProperty(rt, "word",
+                    jsi::String::createFromUtf8(rt, words[i].content));
+    obj.setProperty(rt, "start", static_cast<double>(words[i].start));
+    obj.setProperty(rt, "end", static_cast<double>(words[i].end));
+    jsiArr.setValueAtIndex(rt, i, obj);
+  }
+  return jsiArr;
+}
+
 template <>
 inline JSTensorViewIn getValue(const jsi::Value &val,
                                jsi::Runtime &runtime) {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
index 6299c9c40..f026f30a9 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
@@ -66,11 +66,92 @@ std::vector<char> SpeechToText::transcribe(std::span<float> waveform,
   return {transcription.begin(), transcription.end()};
 }
 
+std::vector<Word> SpeechToText::transcribe(std::span<float> waveform,
+                                           std::string languageOption) const {
+  std::vector<Segment> segments =
+      this->asr->transcribe(waveform, DecodingOptions(languageOption));
+  std::vector<Word> transcription;
+
+  size_t transcriptionLength = 0;
+  for (auto &segment : segments) {
+    transcriptionLength += segment.words.size();
+  }
+
+  transcription.reserve(transcriptionLength);
+
+  for (auto &segment : segments) {
+    for (auto &word : segment.words) {
+      transcription.push_back(word);
+    }
+  }
+
+  auto wordsToJsi = [](jsi::Runtime &rt,
+                       const std::vector<Word> &words) -> jsi::Value {
+    jsi::Array jsiArr(rt, words.size());
+    for (size_t i = 0; i < words.size(); ++i) {
+      jsi::Object obj(rt);
+      obj.setProperty(rt, "word",
+                      jsi::String::createFromUtf8(rt, words[i].content));
+      obj.setProperty(rt, "start", static_cast<double>(words[i].start));
+      obj.setProperty(rt, "end", static_cast<double>(words[i].end));
+      jsiArr.setValueAtIndex(rt, i, obj);
+    }
+    return jsiArr;
+  };
+
+  return transcription;
+}
+
 size_t SpeechToText::getMemoryLowerBound() const noexcept {
   return this->encoder->getMemoryLowerBound() +
          this->decoder->getMemoryLowerBound();
 }
 
+// void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
+//                           std::string languageOption) {
+//   if (this->isStreaming) {
+//     throw RnExecutorchError(RnExecutorchErrorCode::StreamingInProgress,
+//                             "Streaming is already in progress!");
+//   }
+
+//   auto nativeCallback =
+//       [this, callback](const std::vector<char> &committedVec,
+//                        const std::vector<char> &nonCommittedVec,
+//                        bool isDone) {
+//         this->callInvoker->invokeAsync([callback, committedVec,
+//                                         nonCommittedVec,
+//                                         isDone](jsi::Runtime &rt) {
+//           callback->call(
+//               rt,
+//               rnexecutorch::jsi_conversion::getJsiValue(committedVec, rt),
+//               rnexecutorch::jsi_conversion::getJsiValue(nonCommittedVec, rt),
+//               jsi::Value(isDone));
+//         });
+//       };
+
+//   this->isStreaming = true;
+//   while (this->isStreaming) {
+//     if (!this->readyToProcess ||
+//         this->processor->audioBuffer.size() < SpeechToText::kMinAudioSamples) {
+//       std::this_thread::sleep_for(std::chrono::milliseconds(100));
+//       continue;
+//     }
+//     ProcessResult res =
+//         this->processor->processIter(DecodingOptions(languageOption));
+
+//     nativeCallback({res.committed.begin(), res.committed.end()},
+//                    {res.nonCommitted.begin(), res.nonCommitted.end()},
+//                    false);
+//     this->readyToProcess = false;
+//   }
+
+//   std::string committed = this->processor->finish();
+
+//   nativeCallback({committed.begin(), committed.end()}, {}, true);
+
+//   this->resetStreamState();
+// }
+
 void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
                           std::string languageOption) {
   if (this->isStreaming) {
@@ -78,17 +159,33 @@ void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
                            "Streaming is already in progress!");
   }
 
-  auto nativeCallback =
-      [this, callback](const std::vector<char> &committedVec,
-                       const std::vector<char> &nonCommittedVec, bool isDone) {
-        this->callInvoker->invokeAsync([callback, committedVec, nonCommittedVec,
-                                        isDone](jsi::Runtime &rt) {
-          callback->call(
-              rt, rnexecutorch::jsi_conversion::getJsiValue(committedVec, rt),
-              rnexecutorch::jsi_conversion::getJsiValue(nonCommittedVec, rt),
-              jsi::Value(isDone));
-        });
-      };
+  auto wordsToJsi = [](jsi::Runtime &rt,
+                       const std::vector<Word> &words) -> jsi::Value {
+    jsi::Array jsiArr(rt, words.size());
+    for (size_t i = 0; i < words.size(); ++i) {
+      jsi::Object obj(rt);
+      obj.setProperty(rt, "word",
+                      jsi::String::createFromUtf8(rt, words[i].content));
+      obj.setProperty(rt, "start", static_cast<double>(words[i].start));
+      obj.setProperty(rt, "end", static_cast<double>(words[i].end));
+      jsiArr.setValueAtIndex(rt, i, obj);
+    }
+    return jsiArr;
+  };
+
+  auto nativeCallback = [this, callback,
+                         wordsToJsi](const std::vector<Word> &committedVec,
+                                     const std::vector<Word> &nonCommittedVec,
+                                     bool isDone) {
+    this->callInvoker->invokeAsync([callback, committedVec, nonCommittedVec,
+                                    isDone, wordsToJsi](jsi::Runtime &rt) {
+      jsi::Value committedJsi = wordsToJsi(rt, committedVec);
+      jsi::Value nonCommittedJsi = wordsToJsi(rt, nonCommittedVec);
+
+      callback->call(rt, std::move(committedJsi), std::move(nonCommittedJsi),
+                     jsi::Value(isDone));
+    });
+  };
 
   this->isStreaming = true;
   while (this->isStreaming) {
@@ -100,14 +197,14 @@ void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
     ProcessResult res =
         this->processor->processIter(DecodingOptions(languageOption));
 
-    nativeCallback({res.committed.begin(), res.committed.end()},
-                   {res.nonCommitted.begin(), res.nonCommitted.end()}, false);
+    nativeCallback(res.committed, res.nonCommitted, false);
     this->readyToProcess = false;
   }
 
-  std::string committed = this->processor->finish();
+  // finish() now returns std::vector<Word>
+  std::vector<Word> committed = this->processor->finish();
 
-  nativeCallback({committed.begin(), committed.end()}, {}, true);
+  nativeCallback(committed, {}, true);
 
   this->resetStreamState();
 }
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp
index c6a99e9a2..b8a7aced4 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp
@@ -77,23 +77,27 @@ void OnlineASRProcessor::chunkAt(float time) {
   this->bufferTimeOffset = time;
 }
 
-std::string OnlineASRProcessor::finish() {
-  const std::deque<Word> buffer = this->hypothesisBuffer.complete();
-  std::string committedText = this->toFlush(buffer);
+std::vector<Word> OnlineASRProcessor::finish() {
+  std::deque<Word> bufferDeq = this->hypothesisBuffer.complete();
+  std::vector<Word> buffer(std::make_move_iterator(bufferDeq.begin()),
+                           std::make_move_iterator(bufferDeq.end()));
+
+  // std::string committedText = this->toFlush(buffer);
 
   this->bufferTimeOffset += static_cast<float>(audioBuffer.size()) /
                             OnlineASRProcessor::kSamplingRate;
 
-  return committedText;
+  return buffer;
 }
 
-std::string OnlineASRProcessor::toFlush(const std::deque<Word> &words) const {
-  std::string text;
-  text.reserve(std::accumulate(
-      words.cbegin(), words.cend(), 0,
-      [](size_t sum, const Word &w) { return sum + w.content.size(); }));
-  for (const auto &word : words) {
-    text.append(word.content);
-  }
-  return text;
-}
+// std::string OnlineASRProcessor::toFlush(const std::deque<Word> &words) const
+// {
+//   std::string text;
+//   text.reserve(std::accumulate(
+//       words.cbegin(), words.cend(), 0,
+//       [](size_t sum, const Word &w) { return sum + w.content.size(); }));
+//   for (const auto &word : words) {
+//     text.append(word.content);
+//   }
+//   return text;
+// }
 
 } // namespace rnexecutorch::models::speech_to_text::stream
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h
index c50b56271..720e6bf76 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h
@@ -12,7 +12,8 @@ class OnlineASRProcessor {
 
   void insertAudioChunk(std::span<float> audio);
   types::ProcessResult processIter(const types::DecodingOptions &options);
-  std::string finish();
+  // std::string finish();
+  std::vector<types::Word> finish();
 
   std::vector<float> audioBuffer;
 
@@ -27,7 +28,7 @@ class OnlineASRProcessor {
   void chunkCompletedSegment(std::span<types::Segment> res);
   void chunkAt(float time);
 
-  std::string toFlush(const std::deque<types::Word> &words) const;
+  // std::string toFlush(const std::deque<types::Word> &words) const;
 };
 
 } // namespace rnexecutorch::models::speech_to_text::stream
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
index 74734c35e..8285c918d 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
@@ -1,3 +1,141 @@
+// import { useEffect, useCallback, useState } from 'react';
+// import { SpeechToTextModule, Word } from '../../modules/natural_language_processing/SpeechToTextModule';
+// import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt';
+// import { RnExecutorchErrorCode } from '../../errors/ErrorCodes';
+// import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils';
+
+// export const useSpeechToText = ({
+//   model,
+//   preventLoad = false,
+// }: {
+//   model: SpeechToTextModelConfig;
+//   preventLoad?: boolean;
+// }) => {
+//   const [error, setError] = useState<RnExecutorchError | null>(null);
+//   const [isReady, setIsReady] = useState(false);
+//   const [isGenerating, setIsGenerating] = useState(false);
+//   const [downloadProgress, setDownloadProgress] = useState(0);
+
+//   const [modelInstance] = useState(() => new SpeechToTextModule());
+//   const [committedTranscription, setCommittedTranscription] = useState(Word);
+//   const [nonCommittedTranscription, setNonCommittedTranscription] =
+//     useState(Word);
+
+//   useEffect(() => {
+//     if (preventLoad) return;
+//     (async () => {
+//       setDownloadProgress(0);
+//       setError(null);
+//       try {
+//         setIsReady(false);
+//         await modelInstance.load(
+//           {
+//             isMultilingual: model.isMultilingual,
+//             encoderSource: model.encoderSource,
+//             decoderSource: model.decoderSource,
+//             tokenizerSource: model.tokenizerSource,
+//           },
+//           setDownloadProgress
+//         );
+//         setIsReady(true);
+//       } catch (err) {
+//         setError(parseUnknownError(err));
+//       }
+//     })();
+//   }, [
+//     modelInstance,
+//     model.isMultilingual,
+//     model.encoderSource,
+//     model.decoderSource,
+//     model.tokenizerSource,
+//     preventLoad,
+//   ]);
+
+//   const stateWrapper = useCallback(
+//     <T extends (...args: any) => Promise<any>>(fn: T) =>
+//       async (...args: Parameters<T>): Promise<Awaited<ReturnType<T>>> => {
+//         if (!isReady)
+//           throw new RnExecutorchError(
+//             RnExecutorchErrorCode.ModuleNotLoaded,
+//             'The model is currently not loaded. Please load the model before calling this function.'
+//           );
+//         if (isGenerating)
+//           throw new RnExecutorchError(
+//             RnExecutorchErrorCode.ModelGenerating,
+//             'The model is currently generating. Please wait until previous model run is complete.'
+//           );
+//         setIsGenerating(true);
+//         try {
+//           return await fn.apply(modelInstance, args);
+//         } finally {
+//           setIsGenerating(false);
+//         }
+//       },
+//     [isReady, isGenerating, modelInstance]
+//   );
+
+//   const stream = useCallback(
+//     async (options?: DecodingOptions) => {
+//       if (!isReady)
+//         throw new RnExecutorchError(
+//           RnExecutorchErrorCode.ModuleNotLoaded,
+//           'The model is currently not loaded. Please load the model before calling this function.'
+//         );
+//       if (isGenerating)
+//         throw new RnExecutorchError(
+//           RnExecutorchErrorCode.ModelGenerating,
+//           'The model is currently generating. Please wait until previous model run is complete.'
+//         );
+//       setIsGenerating(true);
+//       setCommittedTranscription('');
+//       setNonCommittedTranscription('');
+//       let transcription = '';
+//       try {
+//         for await (const { committed, nonCommitted } of modelInstance.stream(
+//           options
+//         )) {
+//           setCommittedTranscription((prev) => prev + committed);
+//           setNonCommittedTranscription(nonCommitted);
+//           transcription += committed;
+//         }
+//       } finally {
+//         setIsGenerating(false);
+//       }
+//       return transcription;
+//     },
+//     [isReady, isGenerating, modelInstance]
+//   );
+
+//   const wrapper = useCallback(
+//     <T extends (...args: any) => any>(fn: T) => {
+//       return (...args: Parameters<T>): ReturnType<T> => {
+//         if (!isReady)
+//           throw new RnExecutorchError(
+//             RnExecutorchErrorCode.ModuleNotLoaded,
+//             'The model is currently not loaded. Please load the model before calling this function.'
+//           );
+//         return fn.apply(modelInstance, args);
+//       };
+//     },
+//     [isReady, modelInstance]
+//   );
+
+//   return {
+//     error,
+//     isReady,
+//     isGenerating,
+//     downloadProgress,
+//     committedTranscription,
+//     nonCommittedTranscription,
+//     encode: stateWrapper(SpeechToTextModule.prototype.encode),
+//     decode: stateWrapper(SpeechToTextModule.prototype.decode),
+//     transcribe: stateWrapper(SpeechToTextModule.prototype.transcribe),
+//     stream,
+//     streamStop: wrapper(SpeechToTextModule.prototype.streamStop),
+//     streamInsert: wrapper(SpeechToTextModule.prototype.streamInsert),
+//   };
+// };
+
 import { useEffect, useCallback, useState } from 'react';
 import { SpeechToTextModule } from '../../modules/natural_language_processing/SpeechToTextModule';
 import {
@@ -25,9 +163,14 @@ export const useSpeechToText = ({
   const [downloadProgress, setDownloadProgress] = useState(0);
 
   const [modelInstance] = useState(() => new SpeechToTextModule());
-  const [committedTranscription, setCommittedTranscription] = useState('');
-  const [nonCommittedTranscription, setNonCommittedTranscription] =
-    useState('');
+
+  // FIX 1: Initialize with empty array [], generic type Word[]
+  const [committedTranscription, setCommittedTranscription] = useState<Word[]>(
+    []
+  );
+  const [nonCommittedTranscription, setNonCommittedTranscription] = useState<
+    Word[]
+  >([]);
 
   useEffect(() => {
     if (preventLoad) return;
@@ -95,21 +238,31 @@ export const useSpeechToText = ({
           'The model is currently generating. Please wait until previous model run is complete.'
         );
       setIsGenerating(true);
-      setCommittedTranscription('');
-      setNonCommittedTranscription('');
-      let transcription = '';
+
+      // FIX 2: Reset to empty arrays
+      setCommittedTranscription([]);
+      setNonCommittedTranscription([]);
+
+      // Accumulator is now an array of Words, not a string
+      const fullResult: Word[] = [];
+
       try {
         for await (const { committed, nonCommitted } of modelInstance.stream(
           options
         )) {
-          setCommittedTranscription((prev) => prev + committed);
+          // FIX 3: Update state by appending arrays
+          if (committed.length > 0) {
+            setCommittedTranscription((prev) => [...prev, ...committed]);
+            fullResult.push(...committed);
+          }
+
+          // nonCommitted is always a fresh partial chunk
           setNonCommittedTranscription(nonCommitted);
-          transcription += committed;
         }
       } finally {
         setIsGenerating(false);
       }
-      return transcription;
+      return fullResult;
     },
     [isReady, isGenerating, modelInstance]
   );
diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts
index 4b4f196df..803e4146a 100644
--- a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts
+++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts
@@ -10,13 +10,9 @@ import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils';
  */
 export class SpeechToTextModule {
   private nativeModule: any;
-
   private modelConfig!: SpeechToTextModelConfig;
 
-  private textDecoder = new TextDecoder('utf-8', {
-    fatal: false,
-    ignoreBOM: true,
-  });
+  // 2. TextDecoder is removed as C++ now returns JS objects directly
 
   /**
    * Loads the model specified by the config object.
@@ -105,13 +101,14 @@ export class SpeechToTextModule {
   public async transcribe(
     waveform: Float32Array,
     options: DecodingOptions = {}
-  ): Promise<string> {
+  ): Promise<Word[]> {
     this.validateOptions(options);
     const transcriptionBytes = await this.nativeModule.transcribe(
       waveform,
       options.language || ''
     );
-    return this.textDecoder.decode(new Uint8Array(transcriptionBytes));
+
+    return transcriptionBytes;
   }
 
   /**
@@ -128,10 +125,10 @@ export class SpeechToTextModule {
    */
   public async *stream(
     options: DecodingOptions = {}
-  ): AsyncGenerator<{ committed: string; nonCommitted: string }> {
+  ): AsyncGenerator<{ committed: Word[]; nonCommitted: Word[] }> {
     this.validateOptions(options);
 
-    const queue: { committed: string; nonCommitted: string }[] = [];
+    const queue: { committed: Word[]; nonCommitted: Word[] }[] = [];
     let waiter: (() => void) | null = null;
     let finished = false;
     let error: unknown;
@@ -144,12 +141,11 @@ export class SpeechToTextModule {
     (async () => {
       try {
         await this.nativeModule.stream(
-          (committed: number[], nonCommitted: number[], isDone: boolean) => {
+          // Callback now receives arrays of objects directly
+          (committed: Word[], nonCommitted: Word[], isDone: boolean) => {
             queue.push({
-              committed: this.textDecoder.decode(new Uint8Array(committed)),
-              nonCommitted: this.textDecoder.decode(
-                new Uint8Array(nonCommitted)
-              ),
+              committed,
+              nonCommitted,
             });
             if (isDone) {
               finished = true;

From 995d81f43c17fbe92877ad24f7b6e3e05259157a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?=
Date: Tue, 20 Jan 2026 14:44:24 +0100
Subject: [PATCH 02/49] Add missing headers

---
 .../common/rnexecutorch/host_objects/JsiConversions.h  | 1 +
 .../common/rnexecutorch/host_objects/ModelHostObject.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
index 570ba0939..bf1147162 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include <rnexecutorch/models/speech_to_text/types/Word.h>
 #include

diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
index a1ce8e8e8..815964aed 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include <rnexecutorch/models/speech_to_text/types/Word.h>
 #include
 #include
 #include

From 27910a4fc66762a1c074c9bcbc356cb73377a96b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?=
Date: Tue, 20 Jan 2026 21:59:21 +0100
Subject: [PATCH 03/49] Add draft of working version for timestamps only

---
 apps/speech/screens/SpeechToTextScreen.tsx    | 325 +++++++++++++++++-
 .../host_objects/JsiConversions.h             |  49 ++-
 .../host_objects/ModelHostObject.h            |   2 +
 .../models/speech_to_text/SpeechToText.cpp    |  40 +--
 .../stream/OnlineASRProcessor.cpp             |   6 +-
 .../speech_to_text/types/ProcessResult.h      |   9 +-
 6 files changed, 387 insertions(+), 44 deletions(-)

diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx
index da7ed0f7e..542d5dd01 100644
--- a/apps/speech/screens/SpeechToTextScreen.tsx
+++ b/apps/speech/screens/SpeechToTextScreen.tsx
@@
-1,3 +1,300 @@ +// import React, { useEffect, useRef, useState } from 'react'; +// import { +// Text, +// View, +// StyleSheet, +// TouchableOpacity, +// ScrollView, +// TextInput, +// KeyboardAvoidingView, +// Platform, +// } from 'react-native'; +// import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; +// import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch'; +// import FontAwesome from '@expo/vector-icons/FontAwesome'; +// import { +// AudioManager, +// AudioRecorder, +// AudioContext, +// } from 'react-native-audio-api'; +// import * as FileSystem from 'expo-file-system/legacy'; +// import SWMIcon from '../assets/swm_icon.svg'; +// import DeviceInfo from 'react-native-device-info'; + +// const isSimulator = DeviceInfo.isEmulatorSync(); + +// export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { +// const model = useSpeechToText({ +// model: WHISPER_TINY_EN, +// }); + +// const [transcription, setTranscription] = useState(''); +// const [audioURL, setAudioURL] = useState(''); +// const [liveTranscribing, setLiveTranscribing] = useState(false); +// const scrollViewRef = useRef(null); + +// const [recorder] = useState( +// () => +// new AudioRecorder({ +// sampleRate: 16000, +// bufferLengthInSamples: 1600, +// }) +// ); + +// useEffect(() => { +// AudioManager.setAudioSessionOptions({ +// iosCategory: 'playAndRecord', +// iosMode: 'spokenAudio', +// iosOptions: ['allowBluetooth', 'defaultToSpeaker'], +// }); +// AudioManager.requestRecordingPermissions(); +// }, []); + +// const handleTranscribeFromURL = async () => { +// if (!audioURL.trim()) { +// console.warn('Please provide a valid audio file URL'); +// return; +// } + +// const { uri } = await FileSystem.downloadAsync( +// audioURL, +// FileSystem.cacheDirectory + 'audio_file' +// ); + +// const audioContext = new AudioContext({ sampleRate: 16000 }); + +// try { +// const decodedAudioData = await audioContext.decodeAudioDataSource(uri); +// const audioBuffer = decodedAudioData.getChannelData(0); +// setTranscription(await model.transcribe(audioBuffer)); +// } catch (error) { +// console.error('Error decoding audio data', error); +// console.warn('Note: Supported file formats: mp3, wav, flac'); +// return; +// } +// }; + +// const handleStartTranscribeFromMicrophone = async () => { +// setLiveTranscribing(true); +// setTranscription(''); +// recorder.onAudioReady(({ buffer }) => { +// model.streamInsert(buffer.getChannelData(0)); +// }); +// recorder.start(); + +// try { +// await model.stream(); +// } catch (error) { +// console.error('Error during live transcription:', error); +// } +// }; + +// const handleStopTranscribeFromMicrophone = () => { +// recorder.stop(); +// model.streamStop(); +// console.log('Live transcription stopped'); +// setLiveTranscribing(false); +// }; + +// const getModelStatus = () => { +// if (model.error) return `${model.error}`; +// if (model.isGenerating) return 'Transcribing...'; +// if (model.isReady) return 'Ready to transcribe'; +// return `Loading model: ${(100 * model.downloadProgress).toFixed(2)}%`; +// }; + +// const readyToTranscribe = !model.isGenerating && model.isReady; +// const recordingButtonDisabled = isSimulator || !readyToTranscribe; + +// return ( +// +// +// +// +// +// +// +// +// React Native ExecuTorch +// Speech to Text +// + +// +// Status: {getModelStatus()} +// + +// +// Transcription +// +// scrollViewRef.current?.scrollToEnd({ animated: true }) +// } +// > +// +// {transcription !== '' +// ? 
transcription +// : model.committedTranscription + +// model.nonCommittedTranscription} +// +// +// + +// +// +// +// +// Start +// +// + +// {liveTranscribing ? ( +// +// +// Stop Live Transcription +// +// ) : ( +// +// +// +// {isSimulator +// ? 'Recording is not available on Simulator' +// : 'Start Live Transcription'} +// +// +// )} +// +// +// +// +// ); +// }; + +// const styles = StyleSheet.create({ +// container: { +// flex: 1, +// alignItems: 'center', +// backgroundColor: 'white', +// paddingHorizontal: 16, +// }, +// keyboardAvoidingView: { +// flex: 1, +// width: '100%', +// }, +// header: { +// alignItems: 'center', +// position: 'relative', +// width: '100%', +// }, +// backButton: { +// position: 'absolute', +// left: 0, +// top: 10, +// padding: 10, +// zIndex: 1, +// }, +// headerText: { +// fontSize: 22, +// fontWeight: 'bold', +// color: '#0f186e', +// }, +// statusContainer: { +// marginTop: 12, +// alignItems: 'center', +// }, +// transcriptionContainer: { +// flex: 1, +// width: '100%', +// marginVertical: 12, +// }, +// transcriptionLabel: { +// marginLeft: 12, +// marginBottom: 4, +// color: '#0f186e', +// }, +// transcriptionScrollContainer: { +// borderRadius: 12, +// borderWidth: 1, +// borderColor: '#0f186e', +// padding: 12, +// }, +// inputContainer: { +// marginBottom: 12, +// }, +// urlTranscriptionContainer: { +// width: '100%', +// flexDirection: 'row', +// }, +// urlTranscriptionInput: { +// flex: 1, +// padding: 12, +// borderTopLeftRadius: 12, +// borderBottomLeftRadius: 12, +// borderWidth: 1, +// borderColor: '#0f186e', +// borderRightWidth: 0, +// }, +// urlTranscriptionButton: { +// backgroundColor: '#0f186e', +// justifyContent: 'center', +// alignItems: 'center', +// padding: 12, +// borderTopRightRadius: 12, +// borderBottomRightRadius: 12, +// }, +// buttonText: { +// color: 'white', +// fontWeight: '600', +// letterSpacing: -0.5, +// fontSize: 16, +// }, +// liveTranscriptionButton: { +// flexDirection: 'row', +// justifyContent: 'center', +// alignItems: 'center', +// padding: 12, +// borderRadius: 12, +// marginTop: 12, +// gap: 8, +// }, +// backgroundRed: { +// backgroundColor: 'red', +// }, +// backgroundBlue: { +// backgroundColor: '#0f186e', +// }, +// disabled: { +// opacity: 0.5, +// }, +// }); + import React, { useEffect, useRef, useState } from 'react'; import { Text, @@ -10,7 +307,12 @@ import { Platform, } from 'react-native'; import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; -import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch'; +import { + useSpeechToText, + WHISPER_TINY_EN, + // Make sure Word is exported from your module + Word, +} from 'react-native-executorch'; import FontAwesome from '@expo/vector-icons/FontAwesome'; import { AudioManager, @@ -28,7 +330,9 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { model: WHISPER_TINY_EN, }); - const [transcription, setTranscription] = useState(''); + // CHANGE 1: Update state to hold Word[] instead of string + const [transcription, setTranscription] = useState([]); + const [audioURL, setAudioURL] = useState(''); const [liveTranscribing, setLiveTranscribing] = useState(false); const scrollViewRef = useRef(null); @@ -78,6 +382,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { try { const decodedAudioData = await audioContext.decodeAudioDataSource(uri); const audioBuffer = decodedAudioData.getChannelData(0); + // model.transcribe now returns Word[], which matches our state 
type
       setTranscription(await model.transcribe(audioBuffer));
     } catch (error) {
       console.error('Error decoding audio data', error);
@@ -88,7 +393,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
 
   const handleStartTranscribeFromMicrophone = async () => {
     setLiveTranscribing(true);
-    setTranscription('');
+    setTranscription([]); // Reset to empty array
     recorder.onAudioReady(({ buffer }) => {
       model.streamInsert(buffer.getChannelData(0));
     });
@@ -118,6 +423,13 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
   const readyToTranscribe = !model.isGenerating && model.isReady;
   const recordingButtonDisabled = isSimulator || !readyToTranscribe;
 
+  // CHANGE 3: Prepare the text for rendering
+  const displayedText =
+    transcription.length > 0
+      ? getText(transcription)
+      : getText(model.committedTranscription) +
+        getText(model.nonCommittedTranscription);
+
   return (
@@ -147,12 +459,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
             scrollViewRef.current?.scrollToEnd({ animated: true })
           }
         >
-          <Text>
-            {transcription !== ''
-              ? transcription
-              : model.committedTranscription +
-                model.nonCommittedTranscription}
-          </Text>
+          <Text>{displayedText}</Text>
 
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
index bf1147162..184b66a4f 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
@@ -21,6 +21,8 @@
 #include
 #include
 
+using rnexecutorch::models::speech_to_text::types::Word;
+
 namespace rnexecutorch::jsi_conversion {
 
 using namespace facebook;
@@ -64,18 +66,24 @@ getValue<std::shared_ptr<jsi::Function>>(const jsi::Value &val,
 }
 
-// Serializes a vector of timestamped words into a JSI array of objects.
-inline jsi::Value getJsiValue(const std::vector<Word> &words,
-                              jsi::Runtime &rt) {
-  jsi::Array jsiArr(rt, words.size());
-  for (size_t i = 0; i < words.size(); ++i) {
-    jsi::Object obj(rt);
-    obj.setProperty(rt, "word",
-                    jsi::String::createFromUtf8(rt, words[i].content));
-    obj.setProperty(rt, "start", static_cast<double>(words[i].start));
-    obj.setProperty(rt, "end", static_cast<double>(words[i].end));
-    jsiArr.setValueAtIndex(rt, i, obj);
-  }
-  return jsiArr;
-}
+template <>
+inline Word getValue<Word>(const jsi::Value &val, jsi::Runtime &runtime) {
+  jsi::Object obj = val.asObject(runtime);
+
+  // 1. Extract the string "word" using the existing string helper
+  std::string content =
+      getValue<std::string>(obj.getProperty(runtime, "word"), runtime);
+
+  // 2. Extract start/end times
+  // We use .asNumber() directly as these are primitives
+  double start = obj.getProperty(runtime, "start").asNumber();
+  double end = obj.getProperty(runtime, "end").asNumber();
+
+  // 3. Construct and return the C++ Word struct
+  return Word{
+      .content = std::move(content),
+      .start = static_cast<float>(start),
+      .end = static_cast<float>(end)
+  };
+}
 
 template <>
 inline JSTensorViewIn getValue(const jsi::Value &val,
                                jsi::Runtime &runtime) {
@@ -311,6 +319,23 @@ inline jsi::Value getJsiValue(std::shared_ptr<jsi::Value> valuePtr,
   return std::move(*valuePtr);
 }
 
+inline jsi::Value getJsiValue(const Word &word, jsi::Runtime &runtime) {
+  jsi::Object obj(runtime);
+  obj.setProperty(runtime, "word",
+                  jsi::String::createFromUtf8(runtime, word.content));
+  obj.setProperty(runtime, "start", static_cast<double>(word.start));
+  obj.setProperty(runtime, "end", static_cast<double>(word.end));
+  return obj;
+}
+
+inline jsi::Value getJsiValue(const std::vector<Word> &vec,
+                              jsi::Runtime &runtime) {
+  jsi::Array array(runtime, vec.size());
+  for (size_t i = 0; i < vec.size(); ++i) {
+    // Convert each Word using the helper above and place in array
+    array.setValueAtIndex(runtime, i, getJsiValue(vec[i], runtime));
+  }
+  return {runtime, array};
+}
+
 inline jsi::Value getJsiValue(const std::vector &vec,
                               jsi::Runtime &runtime) {
   jsi::Array array(runtime, vec.size());
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
index 815964aed..38210d4cb 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
@@ -26,6 +26,8 @@
 #include
 #include
 
+using rnexecutorch::models::speech_to_text::types::Word;
+
 namespace rnexecutorch {
 
 template <typename Model> class ModelHostObject : public JsiHostObject {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
index f026f30a9..a6ec82795 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
@@ -43,28 +43,28 @@ SpeechToText::decode(std::span tokens,
   return std::make_shared(decoderOutput);
 }
 
-std::vector<char> SpeechToText::transcribe(std::span<float> waveform,
-                                           std::string languageOption) const {
-  std::vector<Segment> segments =
-      this->asr->transcribe(waveform, DecodingOptions(languageOption));
-  std::string transcription;
-
-  size_t transcriptionLength = 0;
-  for (auto &segment : segments) {
-    for (auto &word : segment.words) {
-      transcriptionLength += word.content.size();
-    }
-  }
-  transcription.reserve(transcriptionLength);
+// std::vector<char> SpeechToText::transcribe(std::span<float> waveform,
+//                                            std::string languageOption) const {
+//   std::vector<Segment> segments =
+//       this->asr->transcribe(waveform, DecodingOptions(languageOption));
+//   std::string transcription;
+
+//   size_t transcriptionLength = 0;
+//   for (auto &segment : segments) {
+//     for (auto &word : segment.words) {
+//       transcriptionLength += word.content.size();
+//     }
+//   }
+//   transcription.reserve(transcriptionLength);
 
-  for (auto &segment : segments) {
-    for (auto &word : segment.words) {
-      transcription += word.content;
-    }
-  }
+//   for (auto &segment : segments) {
+//     for (auto &word : segment.words) {
+//       transcription += word.content;
+//     }
+//   }
 
-  return {transcription.begin(), transcription.end()};
-}
+//   return {transcription.begin(), transcription.end()};
+// }
 
 std::vector<Word> SpeechToText::transcribe(std::span<float> waveform,
                                            std::string languageOption) const {
diff --git
a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp
index b8a7aced4..f62986b72 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp
@@ -35,7 +35,11 @@ ProcessResult OnlineASRProcessor::processIter(const DecodingOptions &options) {
   }
 
   std::deque<Word> nonCommittedWords = this->hypothesisBuffer.complete();
-  return {this->toFlush(flushed), this->toFlush(nonCommittedWords)};
+  // return {this->toFlush(flushed), this->toFlush(nonCommittedWords)};
+  return {std::vector<Word>(std::make_move_iterator(flushed.begin()),
+                            std::make_move_iterator(flushed.end())),
+          std::vector<Word>(std::make_move_iterator(nonCommittedWords.begin()),
+                            std::make_move_iterator(nonCommittedWords.end()))};
 }
 
 void OnlineASRProcessor::chunkCompletedSegment(std::span<Segment> res) {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h
index 0cb05e5a6..685ba2b76 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h
@@ -4,9 +4,14 @@
 
 namespace rnexecutorch::models::speech_to_text::types {
 
+// struct ProcessResult {
+//   std::string committed;
+//   std::string nonCommitted;
+// };
+
 struct ProcessResult {
-  std::string committed;
-  std::string nonCommitted;
+  std::vector<Word> committed;
+  std::vector<Word> nonCommitted;
 };
 
 } // namespace rnexecutorch::models::speech_to_text::types

From 0dcff40c491f9b506d130e6587e9e414ad067b00 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?=
Date: Wed, 21 Jan 2026 15:36:31 +0100
Subject: [PATCH 04/49] Working version of both timestamping and regular
 version

---
 apps/speech/screens/SpeechToTextScreen.tsx    | 378 ++++--------------
 .../host_objects/ModelHostObject.h            |   5 +
 .../models/speech_to_text/SpeechToText.cpp    | 109 +++--
 .../models/speech_to_text/SpeechToText.h      |   6 +-
 .../useSpeechToText.ts                        | 223 +++--------
 .../SpeechToTextModule.ts                     |  62 ++-
 .../react-native-executorch/src/types/stt.ts  |   1 +
 7 files changed, 243 insertions(+), 541 deletions(-)

diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx
index 542d5dd01..1e12163f4 100644
--- a/apps/speech/screens/SpeechToTextScreen.tsx
+++ b/apps/speech/screens/SpeechToTextScreen.tsx
@@ -1,300 +1,3 @@
-// import React, { useEffect, useRef, useState } from 'react';
-// import {
-//   Text,
-//   View,
-//   StyleSheet,
-//   TouchableOpacity,
-//   ScrollView,
-//   TextInput,
-//   KeyboardAvoidingView,
-//   Platform,
-// } from 'react-native';
-// import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context';
-// import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch';
-// import FontAwesome from '@expo/vector-icons/FontAwesome';
-// import {
-//   AudioManager,
-//   AudioRecorder,
-//   AudioContext,
-// } from 'react-native-audio-api';
-// import * as FileSystem from 'expo-file-system/legacy';
-// import SWMIcon from '../assets/swm_icon.svg';
-// import DeviceInfo from 'react-native-device-info';
-
-// const isSimulator = DeviceInfo.isEmulatorSync();
-
-// export
const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { -// const model = useSpeechToText({ -// model: WHISPER_TINY_EN, -// }); - -// const [transcription, setTranscription] = useState(''); -// const [audioURL, setAudioURL] = useState(''); -// const [liveTranscribing, setLiveTranscribing] = useState(false); -// const scrollViewRef = useRef(null); - -// const [recorder] = useState( -// () => -// new AudioRecorder({ -// sampleRate: 16000, -// bufferLengthInSamples: 1600, -// }) -// ); - -// useEffect(() => { -// AudioManager.setAudioSessionOptions({ -// iosCategory: 'playAndRecord', -// iosMode: 'spokenAudio', -// iosOptions: ['allowBluetooth', 'defaultToSpeaker'], -// }); -// AudioManager.requestRecordingPermissions(); -// }, []); - -// const handleTranscribeFromURL = async () => { -// if (!audioURL.trim()) { -// console.warn('Please provide a valid audio file URL'); -// return; -// } - -// const { uri } = await FileSystem.downloadAsync( -// audioURL, -// FileSystem.cacheDirectory + 'audio_file' -// ); - -// const audioContext = new AudioContext({ sampleRate: 16000 }); - -// try { -// const decodedAudioData = await audioContext.decodeAudioDataSource(uri); -// const audioBuffer = decodedAudioData.getChannelData(0); -// setTranscription(await model.transcribe(audioBuffer)); -// } catch (error) { -// console.error('Error decoding audio data', error); -// console.warn('Note: Supported file formats: mp3, wav, flac'); -// return; -// } -// }; - -// const handleStartTranscribeFromMicrophone = async () => { -// setLiveTranscribing(true); -// setTranscription(''); -// recorder.onAudioReady(({ buffer }) => { -// model.streamInsert(buffer.getChannelData(0)); -// }); -// recorder.start(); - -// try { -// await model.stream(); -// } catch (error) { -// console.error('Error during live transcription:', error); -// } -// }; - -// const handleStopTranscribeFromMicrophone = () => { -// recorder.stop(); -// model.streamStop(); -// console.log('Live transcription stopped'); -// setLiveTranscribing(false); -// }; - -// const getModelStatus = () => { -// if (model.error) return `${model.error}`; -// if (model.isGenerating) return 'Transcribing...'; -// if (model.isReady) return 'Ready to transcribe'; -// return `Loading model: ${(100 * model.downloadProgress).toFixed(2)}%`; -// }; - -// const readyToTranscribe = !model.isGenerating && model.isReady; -// const recordingButtonDisabled = isSimulator || !readyToTranscribe; - -// return ( -// -// -// -// -// -// -// -// -// React Native ExecuTorch -// Speech to Text -// - -// -// Status: {getModelStatus()} -// - -// -// Transcription -// -// scrollViewRef.current?.scrollToEnd({ animated: true }) -// } -// > -// -// {transcription !== '' -// ? transcription -// : model.committedTranscription + -// model.nonCommittedTranscription} -// -// -// - -// -// -// -// -// Start -// -// - -// {liveTranscribing ? ( -// -// -// Stop Live Transcription -// -// ) : ( -// -// -// -// {isSimulator -// ? 
'Recording is not available on Simulator' -// : 'Start Live Transcription'} -// -// -// )} -// -// -// -// -// ); -// }; - -// const styles = StyleSheet.create({ -// container: { -// flex: 1, -// alignItems: 'center', -// backgroundColor: 'white', -// paddingHorizontal: 16, -// }, -// keyboardAvoidingView: { -// flex: 1, -// width: '100%', -// }, -// header: { -// alignItems: 'center', -// position: 'relative', -// width: '100%', -// }, -// backButton: { -// position: 'absolute', -// left: 0, -// top: 10, -// padding: 10, -// zIndex: 1, -// }, -// headerText: { -// fontSize: 22, -// fontWeight: 'bold', -// color: '#0f186e', -// }, -// statusContainer: { -// marginTop: 12, -// alignItems: 'center', -// }, -// transcriptionContainer: { -// flex: 1, -// width: '100%', -// marginVertical: 12, -// }, -// transcriptionLabel: { -// marginLeft: 12, -// marginBottom: 4, -// color: '#0f186e', -// }, -// transcriptionScrollContainer: { -// borderRadius: 12, -// borderWidth: 1, -// borderColor: '#0f186e', -// padding: 12, -// }, -// inputContainer: { -// marginBottom: 12, -// }, -// urlTranscriptionContainer: { -// width: '100%', -// flexDirection: 'row', -// }, -// urlTranscriptionInput: { -// flex: 1, -// padding: 12, -// borderTopLeftRadius: 12, -// borderBottomLeftRadius: 12, -// borderWidth: 1, -// borderColor: '#0f186e', -// borderRightWidth: 0, -// }, -// urlTranscriptionButton: { -// backgroundColor: '#0f186e', -// justifyContent: 'center', -// alignItems: 'center', -// padding: 12, -// borderTopRightRadius: 12, -// borderBottomRightRadius: 12, -// }, -// buttonText: { -// color: 'white', -// fontWeight: '600', -// letterSpacing: -0.5, -// fontSize: 16, -// }, -// liveTranscriptionButton: { -// flexDirection: 'row', -// justifyContent: 'center', -// alignItems: 'center', -// padding: 12, -// borderRadius: 12, -// marginTop: 12, -// gap: 8, -// }, -// backgroundRed: { -// backgroundColor: 'red', -// }, -// backgroundBlue: { -// backgroundColor: '#0f186e', -// }, -// disabled: { -// opacity: 0.5, -// }, -// }); - import React, { useEffect, useRef, useState } from 'react'; import { Text, @@ -305,12 +8,12 @@ import { TextInput, KeyboardAvoidingView, Platform, + Switch, // Import Switch } from 'react-native'; import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; import { useSpeechToText, WHISPER_TINY_EN, - // Make sure Word is exported from your module Word, } from 'react-native-executorch'; import FontAwesome from '@expo/vector-icons/FontAwesome'; @@ -330,8 +33,11 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { model: WHISPER_TINY_EN, }); - // CHANGE 1: Update state to hold Word[] instead of string - const [transcription, setTranscription] = useState([]); + // CHANGE 1: State can now be string OR Word[] + const [transcription, setTranscription] = useState(''); + + // CHANGE 2: Add toggle for timestamps + const [enableTimestamps, setEnableTimestamps] = useState(false); const [audioURL, setAudioURL] = useState(''); const [liveTranscribing, setLiveTranscribing] = useState(false); @@ -370,6 +76,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { } const handleTranscribeFromURL = async () => { + console.log('[1] UI: Button Pressed. 
Calling model.stream()...'); if (!audioURL.trim()) { console.warn('Please provide a valid audio file URL'); return; @@ -382,8 +89,20 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { try { const decodedAudioData = await audioContext.decodeAudioDataSource(uri); const audioBuffer = decodedAudioData.getChannelData(0); - // model.transcribe now returns Word[], which matches our state type - setTranscription(await model.transcribe(audioBuffer)); + + // CHANGE 4: Pass the toggle flag to transcribe + // TypeScript will infer the return type based on the flag + if (enableTimestamps) { + const result = await model.transcribe(audioBuffer, { + enableTimestamps: true, + }); + setTranscription(result); + } else { + const result = await model.transcribe(audioBuffer, { + enableTimestamps: false, + }); + setTranscription(result); + } } catch (error) { console.error('Error decoding audio data', error); console.warn('Note: Supported file formats: mp3, wav, flac'); @@ -393,14 +112,21 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const handleStartTranscribeFromMicrophone = async () => { setLiveTranscribing(true); - setTranscription([]); // Reset to empty array + // Reset based on mode + setTranscription(enableTimestamps ? [] : ''); + recorder.onAudioReady(({ buffer }) => { model.streamInsert(buffer.getChannelData(0)); }); recorder.start(); try { - await model.stream(); + // CHANGE 5: Pass the toggle flag to stream + if (enableTimestamps) { + await model.stream({ enableTimestamps: true }); + } else { + await model.stream({ enableTimestamps: false }); + } } catch (error) { console.error('Error during live transcription:', error); } @@ -423,12 +149,16 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const readyToTranscribe = !model.isGenerating && model.isReady; const recordingButtonDisabled = isSimulator || !readyToTranscribe; - // CHANGE 3: Prepare the text for rendering - const displayedText = - transcription.length > 0 - ? getText(transcription) - : getText(model.committedTranscription) + - getText(model.nonCommittedTranscription); + // CHANGE 6: Logic to choose what text to display + // We use getText() on everything so it converts Arrays to Strings before concatenation + const hasResult = Array.isArray(transcription) + ? transcription.length > 0 + : transcription.length > 0; + + const displayedText = hasResult + ? getText(transcription) + : getText(model.committedTranscription) + + getText(model.nonCommittedTranscription); return ( @@ -450,6 +180,21 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { Status: {getModelStatus()} + {/* CHANGE 7: Add UI for the Toggle */} + + Enable Timestamps + { + setEnableTimestamps(val); + setTranscription(val ? [] : ''); // Reset transcription on toggle + }} + trackColor={{ false: '#767577', true: '#0f186e' }} + thumbColor={enableTimestamps ? '#fff' : '#f4f3f4'} + disabled={model.isGenerating} // Disable changing mode while running + /> + + Transcription void }) => { }; const styles = StyleSheet.create({ + // ... existing styles ... 
  container: {
     flex: 1,
     alignItems: 'center',
     backgroundColor: 'white',
     paddingHorizontal: 16,
   },
@@ -548,6 +294,18 @@ const styles = StyleSheet.create({
     marginTop: 12,
     alignItems: 'center',
   },
+  // New style for the toggle
+  toggleContainer: {
+    flexDirection: 'row',
+    alignItems: 'center',
+    marginTop: 10,
+    marginBottom: 5,
+  },
+  toggleLabel: {
+    fontSize: 16,
+    marginRight: 10,
+    color: '#0f186e',
+  },
   transcriptionContainer: {
     flex: 1,
     width: '100%',
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
index 38210d4cb..eeebd4b97 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
@@ -75,6 +75,11 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
                               promiseHostFunction<&Model::transcribe>,
                               "transcribe"));
 
+    addFunctions(
+        JSI_EXPORT_FUNCTION(ModelHostObject,
+                            promiseHostFunction<&Model::transcribeStringOnly>,
+                            "transcribeStringOnly"));
+
     addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject,
                                      promiseHostFunction<&Model::stream>,
                                      "stream"));
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
index a6ec82795..ad937b56c 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
@@ -44,7 +44,8 @@ SpeechToText::decode(std::span tokens,
 }
 
 // std::vector<char> SpeechToText::transcribe(std::span<float> waveform,
-//                                            std::string languageOption) const {
+//                                            std::string languageOption) const
+// {
 //   std::vector<Segment> segments =
 //       this->asr->transcribe(waveform, DecodingOptions(languageOption));
 //   std::string transcription;
@@ -85,21 +86,47 @@ std::vector<Word> SpeechToText::transcribe(std::span<float> waveform,
     }
   }
 
-  auto wordsToJsi = [](jsi::Runtime &rt,
-                       const std::vector<Word> &words) -> jsi::Value {
-    jsi::Array jsiArr(rt, words.size());
-    for (size_t i = 0; i < words.size(); ++i) {
-      jsi::Object obj(rt);
-      obj.setProperty(rt, "word",
-                      jsi::String::createFromUtf8(rt, words[i].content));
-      obj.setProperty(rt, "start", static_cast<double>(words[i].start));
-      obj.setProperty(rt, "end", static_cast<double>(words[i].end));
-      jsiArr.setValueAtIndex(rt, i, obj);
-    }
-    return jsiArr;
-  };
-
   return transcription;
 }
 
+std::vector<char>
+SpeechToText::transcribeStringOnly(std::span<float> waveform,
+                                   std::string languageOption) const {
+  std::vector<Segment> segments =
+      this->asr->transcribe(waveform, DecodingOptions(languageOption));
+  std::string transcription;
+
+  size_t transcriptionLength = 0;
+  for (auto &segment : segments) {
+    for (auto &word : segment.words) {
+      transcriptionLength += word.content.size();
+    }
+  }
+  transcription.reserve(transcriptionLength);
+
+  for (auto &segment : segments) {
+    for (auto &word : segment.words) {
+      transcription += word.content;
+    }
+  }
+
+  return {transcription.begin(), transcription.end()};
+}
+
+std::vector<char> mergeWordsToString(const std::vector<Word> &words) {
+  std::string result;
+  size_t totalLength = 0;
+
+  for (const auto &word : words) {
+    totalLength += word.content.size();
+  }
+  result.reserve(totalLength);
+
+  for (const auto &word : words) {
+    result += word.content;
+  }
+
+  return {result.begin(), result.end()};
+}
 
 size_t
SpeechToText::getMemoryLowerBound() const noexcept {
   return this->encoder->getMemoryLowerBound() +
          this->decoder->getMemoryLowerBound();
 }
 
 void SpeechToText::stream(std::shared_ptr<jsi::Function> callback,
-                          std::string languageOption) {
+                          std::string languageOption, bool enableTimestamps) {
   if (this->isStreaming) {
     throw RnExecutorchError(RnExecutorchErrorCode::StreamingInProgress,
                             "Streaming is already in progress!");
   }
 
-  auto wordsToJsi = [](jsi::Runtime &rt,
-                       const std::vector<Word> &words) -> jsi::Value {
-    jsi::Array jsiArr(rt, words.size());
-    for (size_t i = 0; i < words.size(); ++i) {
-      jsi::Object obj(rt);
-      obj.setProperty(rt, "word",
-                      jsi::String::createFromUtf8(rt, words[i].content));
-      obj.setProperty(rt, "start", static_cast<double>(words[i].start));
-      obj.setProperty(rt, "end", static_cast<double>(words[i].end));
-      jsiArr.setValueAtIndex(rt, i, obj);
-    }
-    return jsiArr;
-  };
-
-  auto nativeCallback = [this, callback,
-                         wordsToJsi](const std::vector<Word> &committedVec,
-                                     const std::vector<Word> &nonCommittedVec,
-                                     bool isDone) {
-    this->callInvoker->invokeAsync([callback, committedVec, nonCommittedVec,
-                                    isDone, wordsToJsi](jsi::Runtime &rt) {
-      jsi::Value committedJsi = wordsToJsi(rt, committedVec);
-      jsi::Value nonCommittedJsi = wordsToJsi(rt, nonCommittedVec);
-
-      callback->call(rt, std::move(committedJsi), std::move(nonCommittedJsi),
-                     jsi::Value(isDone));
-    });
-  };
+  auto nativeCallback = [this, callback](const auto &committedVec,
+                                         const auto &nonCommittedVec,
+                                         bool isDone) {
+    this->callInvoker->invokeAsync(
+        [callback, committedVec, nonCommittedVec, isDone](jsi::Runtime &rt) {
+          jsi::Value committedJsi =
+              rnexecutorch::jsi_conversion::getJsiValue(committedVec, rt);
+          jsi::Value nonCommittedJsi =
+              rnexecutorch::jsi_conversion::getJsiValue(nonCommittedVec, rt);
+
+          callback->call(rt, std::move(committedJsi),
+                         std::move(nonCommittedJsi), jsi::Value(isDone));
+        });
+  };
 
   this->isStreaming = true;
   while (this->isStreaming) {
     if (!this->readyToProcess ||
         this->processor->audioBuffer.size() < SpeechToText::kMinAudioSamples) {
       std::this_thread::sleep_for(std::chrono::milliseconds(100));
       continue;
     }
     ProcessResult res =
         this->processor->processIter(DecodingOptions(languageOption));
 
-    nativeCallback(res.committed, res.nonCommitted, false);
+    if (enableTimestamps) {
+      nativeCallback(res.committed, res.nonCommitted, false);
+    } else {
+      nativeCallback(mergeWordsToString(res.committed),
+                     mergeWordsToString(res.nonCommitted), false);
+    }
     this->readyToProcess = false;
   }
 
   // finish() now returns std::vector<Word>
   std::vector<Word> committed = this->processor->finish();
 
-  nativeCallback(committed, {}, true);
+  if (enableTimestamps) {
+    nativeCallback(committed, std::vector<Word>{}, true);
+  } else {
+    nativeCallback(mergeWordsToString(committed), std::vector<char>(), true);
+  }
 
   this->resetStreamState();
 }
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
index e206f6ca7..8b525cc2d 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
@@ -26,11 +26,15 @@ class SpeechToText {
   [[nodiscard("Registered non-void function")]] std::vector<Word>
   transcribe(std::span<float> waveform, std::string languageOption) const;
 
+  [[nodiscard("Registered non-void function")]]
+  std::vector<char> transcribeStringOnly(std::span<float> waveform,
+                                         std::string languageOption) const;
+
   size_t getMemoryLowerBound() const noexcept;
 
   // Stream
   void stream(std::shared_ptr<jsi::Function> callback,
-              std::string languageOption);
+              std::string languageOption, bool enableTimestamps);
   void streamStop();
   void streamInsert(std::span<float> waveform);
diff
--git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
index 8285c918d..f9b5da8b1 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
@@ -1,141 +1,3 @@
-// import { useEffect, useCallback, useState } from 'react';
-// import { SpeechToTextModule, Word } from '../../modules/natural_language_processing/SpeechToTextModule';
-// import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt';
-// import { RnExecutorchErrorCode } from '../../errors/ErrorCodes';
-// import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils';
-
-// export const useSpeechToText = ({
-//   model,
-//   preventLoad = false,
-// }: {
-//   model: SpeechToTextModelConfig;
-//   preventLoad?: boolean;
-// }) => {
-//   const [error, setError] = useState<RnExecutorchError | null>(null);
-//   const [isReady, setIsReady] = useState(false);
-//   const [isGenerating, setIsGenerating] = useState(false);
-//   const [downloadProgress, setDownloadProgress] = useState(0);
-
-//   const [modelInstance] = useState(() => new SpeechToTextModule());
-//   const [committedTranscription, setCommittedTranscription] = useState(Word);
-//   const [nonCommittedTranscription, setNonCommittedTranscription] =
-//     useState(Word);
-
-//   useEffect(() => {
-//     if (preventLoad) return;
-//     (async () => {
-//       setDownloadProgress(0);
-//       setError(null);
-//       try {
-//         setIsReady(false);
-//         await modelInstance.load(
-//           {
-//             isMultilingual: model.isMultilingual,
-//             encoderSource: model.encoderSource,
-//             decoderSource: model.decoderSource,
-//             tokenizerSource: model.tokenizerSource,
-//           },
-//           setDownloadProgress
-//         );
-//         setIsReady(true);
-//       } catch (err) {
-//         setError(parseUnknownError(err));
-//       }
-//     })();
-//   }, [
-//     modelInstance,
-//     model.isMultilingual,
-//     model.encoderSource,
-//     model.decoderSource,
-//     model.tokenizerSource,
-//     preventLoad,
-//   ]);
-
-//   const stateWrapper = useCallback(
-//     <T extends (...args: any) => Promise<any>>(fn: T) =>
-//       async (...args: Parameters<T>): Promise<Awaited<ReturnType<T>>> => {
-//         if (!isReady)
-//           throw new RnExecutorchError(
-//             RnExecutorchErrorCode.ModuleNotLoaded,
-//             'The model is currently not loaded. Please load the model before calling this function.'
-//           );
-//         if (isGenerating)
-//           throw new RnExecutorchError(
-//             RnExecutorchErrorCode.ModelGenerating,
-//             'The model is currently generating. Please wait until previous model run is complete.'
-//           );
-//         setIsGenerating(true);
-//         try {
-//           return await fn.apply(modelInstance, args);
-//         } finally {
-//           setIsGenerating(false);
-//         }
-//       },
-//     [isReady, isGenerating, modelInstance]
-//   );
-
-//   const stream = useCallback(
-//     async (options?: DecodingOptions) => {
-//       if (!isReady)
-//         throw new RnExecutorchError(
-//           RnExecutorchErrorCode.ModuleNotLoaded,
-//           'The model is currently not loaded. Please load the model before calling this function.'
-//         );
-//       if (isGenerating)
-//         throw new RnExecutorchError(
-//           RnExecutorchErrorCode.ModelGenerating,
-//           'The model is currently generating. Please wait until previous model run is complete.'
-// ); -// setIsGenerating(true); -// setCommittedTranscription(''); -// setNonCommittedTranscription(''); -// let transcription = ''; -// try { -// for await (const { committed, nonCommitted } of modelInstance.stream( -// options -// )) { -// setCommittedTranscription((prev) => prev + committed); -// setNonCommittedTranscription(nonCommitted); -// transcription += committed; -// } -// } finally { -// setIsGenerating(false); -// } -// return transcription; -// }, -// [isReady, isGenerating, modelInstance] -// ); - -// const wrapper = useCallback( -// any>(fn: T) => { -// return (...args: Parameters): ReturnType => { -// if (!isReady) -// throw new RnExecutorchError( -// RnExecutorchErrorCode.ModuleNotLoaded, -// 'The model is currently not loaded. Please load the model before calling this function.' -// ); -// return fn.apply(modelInstance, args); -// }; -// }, -// [isReady, modelInstance] -// ); - -// return { -// error, -// isReady, -// isGenerating, -// downloadProgress, -// committedTranscription, -// nonCommittedTranscription, -// encode: stateWrapper(SpeechToTextModule.prototype.encode), -// decode: stateWrapper(SpeechToTextModule.prototype.decode), -// transcribe: stateWrapper(SpeechToTextModule.prototype.transcribe), -// stream, -// streamStop: wrapper(SpeechToTextModule.prototype.streamStop), -// streamInsert: wrapper(SpeechToTextModule.prototype.streamInsert), -// }; -// }; - import { useEffect, useCallback, useState } from 'react'; import { SpeechToTextModule } from '../../modules/natural_language_processing/SpeechToTextModule'; import { @@ -164,13 +26,13 @@ export const useSpeechToText = ({ const [modelInstance] = useState(() => new SpeechToTextModule()); - // FIX 1: Initialize with empty array [], generic type Word[] - const [committedTranscription, setCommittedTranscription] = useState( - [] - ); + // FIX 1: Allow state to be either string or Word[] + const [committedTranscription, setCommittedTranscription] = useState< + string | Word[] + >(''); const [nonCommittedTranscription, setNonCommittedTranscription] = useState< - Word[] - >([]); + string | Word[] + >(''); useEffect(() => { if (preventLoad) return; @@ -193,14 +55,7 @@ export const useSpeechToText = ({ setError(parseUnknownError(err)); } })(); - }, [ - modelInstance, - model.isMultilingual, - model.encoderSource, - model.decoderSource, - model.tokenizerSource, - preventLoad, - ]); + }, [modelInstance, model, preventLoad]); const stateWrapper = useCallback( Promise>(fn: T) => @@ -208,12 +63,12 @@ export const useSpeechToText = ({ if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling this function.' + 'The model is currently not loaded.' ); if (isGenerating) throw new RnExecutorchError( RnExecutorchErrorCode.ModelGenerating, - 'The model is currently generating. Please wait until previous model run is complete.' + 'The model is currently generating.' ); setIsGenerating(true); try { @@ -226,38 +81,66 @@ export const useSpeechToText = ({ ); const stream = useCallback( - async (options?: DecodingOptions) => { + async (options?: DecodingOptions & { enableTimestamps?: boolean }) => { + console.log( + '[2] Hook: Stream called. Ready:', + isReady, + 'Generating:', + isGenerating + ); if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling this function.' 
+ 'Model not loaded' ); if (isGenerating) throw new RnExecutorchError( RnExecutorchErrorCode.ModelGenerating, - 'The model is currently generating. Please wait until previous model run is complete.' + 'Model is generating' ); + setIsGenerating(true); - // FIX 2: Reset to empty arrays - setCommittedTranscription([]); - setNonCommittedTranscription([]); + // FIX 2: Reset based on the mode requested + const enableTimestamps = options?.enableTimestamps ?? false; + setCommittedTranscription(enableTimestamps ? [] : ''); + setNonCommittedTranscription(enableTimestamps ? [] : ''); - // Accumulator is now an array of Words, not a string - const fullResult: Word[] = []; + let fullResult: string | Word[] = enableTimestamps ? [] : ''; try { + console.log('[3] Hook: Calling modelInstance.stream()'); + // @ts-ignore - Typescript struggles with the dual generator return type, but logic is safe for await (const { committed, nonCommitted } of modelInstance.stream( options )) { - // FIX 3: Update state by appending arrays - if (committed.length > 0) { - setCommittedTranscription((prev) => [...prev, ...committed]); - fullResult.push(...committed); + console.log(committed, nonCommitted); + // FIX 3: Dynamic Merging Logic + if (typeof committed === 'string') { + // --- STRING MODE --- + if (committed.length > 0) { + setCommittedTranscription((prev) => { + // Safety check: if prev was somehow an array, reset it or cast to string + const prevStr = typeof prev === 'string' ? prev : ''; + return prevStr + committed; + }); + (fullResult as string) += committed; + } + setNonCommittedTranscription(nonCommitted as string); + } else { + // --- WORD[] MODE --- + const committedWords = committed as Word[]; + const nonCommittedWords = nonCommitted as Word[]; + + if (committedWords.length > 0) { + setCommittedTranscription((prev) => { + const prevArr = Array.isArray(prev) ? prev : []; + return [...prevArr, ...committedWords]; + }); + (fullResult as Word[]).push(...committedWords); + } + setNonCommittedTranscription(nonCommittedWords); } - - // nonCommitted is always a fresh partial chunk - setNonCommittedTranscription(nonCommitted); } } finally { setIsGenerating(false); @@ -273,7 +156,7 @@ export const useSpeechToText = ({ if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded. Please load the model before calling this function.' + 'Model not loaded' ); return fn.apply(modelInstance, args); }; diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts index 803e4146a..b52f49b4b 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts @@ -12,7 +12,10 @@ export class SpeechToTextModule { private nativeModule: any; private modelConfig!: SpeechToTextModelConfig; - // 2. TextDecoder is removed as C++ now returns JS objects directly + private textDecoder = new TextDecoder('utf-8', { + fatal: false, + ignoreBOM: true, + }); /** * Loads the model specified by the config object. 
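The hunks that follow rework `stream()` into an async generator fed by the native callback through a queue/waiter pair (`queue`, `waiter`, `finished`, `wake`). As a reading aid, here is that bridge pattern in isolation: a minimal, self-contained sketch. Every name in it is illustrative rather than part of the react-native-executorch API, and the `Word` shape is reproduced only so the snippet compiles.

```typescript
// Producer/consumer bridge: a push-style callback on one side, an async
// generator (`for await`) on the other. Illustrative sketch only.
type Word = { word: string; start: number; end: number };
type Chunk = { committed: string | Word[]; nonCommitted: string | Word[] };

function makeStreamChannel() {
  const queue: Chunk[] = [];
  let waiter: (() => void) | null = null;
  let finished = false;

  // Resolve the parked consumer, if there is one.
  const wake = () => {
    waiter?.();
    waiter = null;
  };

  return {
    // Producer side: called for every partial result.
    push(chunk: Chunk) {
      queue.push(chunk);
      wake();
    },
    // Producer side: called exactly once when the stream ends.
    close() {
      finished = true;
      wake();
    },
    // Consumer side: drains the queue, parking on a promise when empty.
    async *consume(): AsyncGenerator<Chunk> {
      while (!finished || queue.length > 0) {
        if (queue.length === 0) {
          await new Promise<void>((resolve) => (waiter = resolve));
          continue;
        }
        yield queue.shift()!;
      }
    },
  };
}
```

Both `push` and `close` must call `wake()`, or a consumer parked on the promise would never resume; the module's callback below keeps the same invariant by waking after every enqueue and once more when `isDone` is set.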
@@ -101,7 +104,7 @@ export class SpeechToTextModule { public async transcribe( waveform: Float32Array, options: DecodingOptions = {} - ): Promise { + ): Promise { this.validateOptions(options); const transcriptionBytes = await this.nativeModule.transcribe( waveform, @@ -125,10 +128,21 @@ export class SpeechToTextModule { */ public async *stream( options: DecodingOptions = {} - ): AsyncGenerator<{ committed: Word[]; nonCommitted: Word[] }> { + ): AsyncGenerator<{ + committed: string | Word[]; + nonCommitted: string | Word[]; + }> { + console.log('[4] Module: Entered stream method'); this.validateOptions(options); - const queue: { committed: Word[]; nonCommitted: Word[] }[] = []; + // Ensure we strictly default to false + const enableTimestamps = options.enableTimestamps === true; + + const queue: { + committed: string | Word[]; + nonCommitted: string | Word[]; + }[] = []; + let waiter: (() => void) | null = null; let finished = false; let error: unknown; @@ -140,20 +154,34 @@ export class SpeechToTextModule { (async () => { try { - await this.nativeModule.stream( - // Callback now receives arrays of objects directly - (committed: Word[], nonCommitted: Word[], isDone: boolean) => { - queue.push({ - committed, - nonCommitted, - }); - if (isDone) { - finished = true; + const callback = ( + committed: any, + nonCommitted: any, + isDone: boolean + ) => { + if (!enableTimestamps) { + try { + queue.push({ + committed: this.textDecoder.decode(new Uint8Array(committed)), + nonCommitted: this.textDecoder.decode( + new Uint8Array(nonCommitted) + ), + }); + } catch (err) { + console.error('[Stream Decode Error]', err); } - wake(); - }, - options.language || '' - ); + } else { + queue.push({ committed, nonCommitted }); + } + + if (isDone) finished = true; + wake(); + }; + + const language = options.language || ''; + + await this.nativeModule.stream(callback, language, enableTimestamps); + finished = true; wake(); } catch (e) { diff --git a/packages/react-native-executorch/src/types/stt.ts b/packages/react-native-executorch/src/types/stt.ts index ece8d6020..cfabb2313 100644 --- a/packages/react-native-executorch/src/types/stt.ts +++ b/packages/react-native-executorch/src/types/stt.ts @@ -195,6 +195,7 @@ export type SpeechToTextLanguage = */ export interface DecodingOptions { language?: SpeechToTextLanguage; + enableTimestamps?: boolean; } /** From 2d119d21dd7d950e5563ad6cc96bd132e3d73f5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 15:54:28 +0100 Subject: [PATCH 05/49] Clear files --- apps/speech/screens/SpeechToTextScreen.tsx | 23 ++---- .../host_objects/JsiConversions.h | 4 -- .../models/speech_to_text/SpeechToText.cpp | 70 ------------------- .../SpeechToTextModule.ts | 2 - 4 files changed, 5 insertions(+), 94 deletions(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index 1e12163f4..f513dbc06 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -8,7 +8,7 @@ import { TextInput, KeyboardAvoidingView, Platform, - Switch, // Import Switch + Switch, } from 'react-native'; import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; import { @@ -33,10 +33,8 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { model: WHISPER_TINY_EN, }); - // CHANGE 1: State can now be string OR Word[] const [transcription, setTranscription] = useState(''); - // CHANGE 2: Add toggle for timestamps const [enableTimestamps, 
setEnableTimestamps] = useState(false); const [audioURL, setAudioURL] = useState(''); @@ -76,7 +74,6 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { } const handleTranscribeFromURL = async () => { - console.log('[1] UI: Button Pressed. Calling model.stream()...'); if (!audioURL.trim()) { console.warn('Please provide a valid audio file URL'); return; @@ -94,12 +91,12 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { // TypeScript will infer the return type based on the flag if (enableTimestamps) { const result = await model.transcribe(audioBuffer, { - enableTimestamps: true, + enableTimestamps: true }); setTranscription(result); } else { const result = await model.transcribe(audioBuffer, { - enableTimestamps: false, + enableTimestamps: false }); setTranscription(result); } @@ -112,7 +109,6 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const handleStartTranscribeFromMicrophone = async () => { setLiveTranscribing(true); - // Reset based on mode setTranscription(enableTimestamps ? [] : ''); recorder.onAudioReady(({ buffer }) => { @@ -121,12 +117,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { recorder.start(); try { - // CHANGE 5: Pass the toggle flag to stream - if (enableTimestamps) { - await model.stream({ enableTimestamps: true }); - } else { - await model.stream({ enableTimestamps: false }); - } + await model.stream({ enableTimestamps: enableTimestamps }); } catch (error) { console.error('Error during live transcription:', error); } @@ -149,11 +140,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const readyToTranscribe = !model.isGenerating && model.isReady; const recordingButtonDisabled = isSimulator || !readyToTranscribe; - // CHANGE 6: Logic to choose what text to display - // We use getText() on everything so it converts Arrays to Strings before concatenation - const hasResult = Array.isArray(transcription) - ? transcription.length > 0 - : transcription.length > 0; + const hasResult = transcription.length > 0; const displayedText = hasResult ? getText(transcription) diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index 184b66a4f..9e2df2fc8 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -69,15 +69,11 @@ template <> inline Word getValue(const jsi::Value &val, jsi::Runtime &runtime) { jsi::Object obj = val.asObject(runtime); - // 1. Extract the string "word" using the existing string helper std::string content = getValue(obj.getProperty(runtime, "word"), runtime); - // 2. Extract start/end times - // We use .asNumber() directly as these are primitives double start = obj.getProperty(runtime, "start").asNumber(); double end = obj.getProperty(runtime, "end").asNumber(); - // 3. 
Construct and return the C++ Word struct return Word{ .content = std::move(content), .start = static_cast(start), diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp index ad937b56c..04b242454 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp @@ -43,30 +43,6 @@ SpeechToText::decode(std::span tokens, return std::make_shared(decoderOutput); } -// std::vector SpeechToText::transcribe(std::span waveform, -// std::string languageOption) const -// { -// std::vector segments = -// this->asr->transcribe(waveform, DecodingOptions(languageOption)); -// std::string transcription; - -// size_t transcriptionLength = 0; -// for (auto &segment : segments) { -// for (auto &word : segment.words) { -// transcriptionLength += word.content.size(); -// } -// } -// transcription.reserve(transcriptionLength); - -// for (auto &segment : segments) { -// for (auto &word : segment.words) { -// transcription += word.content; -// } -// } - -// return {transcription.begin(), transcription.end()}; -// } - std::vector SpeechToText::transcribe(std::span waveform, std::string languageOption) const { std::vector segments = @@ -134,51 +110,6 @@ size_t SpeechToText::getMemoryLowerBound() const noexcept { this->decoder->getMemoryLowerBound(); } -// void SpeechToText::stream(std::shared_ptr callback, -// std::string languageOption) { -// if (this->isStreaming) { -// throw RnExecutorchError(RnExecutorchErrorCode::StreamingInProgress, -// "Streaming is already in progress!"); -// } - -// auto nativeCallback = -// [this, callback](const std::vector &committedVec, -// const std::vector &nonCommittedVec, bool isDone) -// { -// this->callInvoker->invokeAsync([callback, committedVec, -// nonCommittedVec, -// isDone](jsi::Runtime &rt) { -// callback->call( -// rt, rnexecutorch::jsi_conversion::getJsiValue(committedVec, -// rt), rnexecutorch::jsi_conversion::getJsiValue(nonCommittedVec, -// rt), jsi::Value(isDone)); -// }); -// }; - -// this->isStreaming = true; -// while (this->isStreaming) { -// if (!this->readyToProcess || -// this->processor->audioBuffer.size() < SpeechToText::kMinAudioSamples) -// { -// std::this_thread::sleep_for(std::chrono::milliseconds(100)); -// continue; -// } -// ProcessResult res = -// this->processor->processIter(DecodingOptions(languageOption)); - -// nativeCallback({res.committed.begin(), res.committed.end()}, -// {res.nonCommitted.begin(), res.nonCommitted.end()}, -// false); -// this->readyToProcess = false; -// } - -// std::string committed = this->processor->finish(); - -// nativeCallback({committed.begin(), committed.end()}, {}, true); - -// this->resetStreamState(); -// } - void SpeechToText::stream(std::shared_ptr callback, std::string languageOption, bool enableTimestamps) { if (this->isStreaming) { @@ -220,7 +151,6 @@ void SpeechToText::stream(std::shared_ptr callback, this->readyToProcess = false; } - // finish() now returns std::vector std::vector committed = this->processor->finish(); if (enableTimestamps) { diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts index b52f49b4b..cc5669b27 100644 --- 
a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts @@ -132,10 +132,8 @@ export class SpeechToTextModule { committed: string | Word[]; nonCommitted: string | Word[]; }> { - console.log('[4] Module: Entered stream method'); this.validateOptions(options); - // Ensure we strictly default to false const enableTimestamps = options.enableTimestamps === true; const queue: { From 30b76cf5f59dd7cfa315517f2b7d1cb358d077b7 Mon Sep 17 00:00:00 2001 From: Mateusz Sluszniak <56299341+msluszniak@users.noreply.github.com> Date: Wed, 21 Jan 2026 16:36:46 +0100 Subject: [PATCH 06/49] Apply suggestions from code review --- apps/speech/screens/SpeechToTextScreen.tsx | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index f513dbc06..735e66d78 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -91,12 +91,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { // TypeScript will infer the return type based on the flag if (enableTimestamps) { const result = await model.transcribe(audioBuffer, { - enableTimestamps: true - }); - setTranscription(result); - } else { - const result = await model.transcribe(audioBuffer, { - enableTimestamps: false + enableTimestamps: enableTimestamps }); setTranscription(result); } @@ -167,18 +162,17 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { Status: {getModelStatus()} - {/* CHANGE 7: Add UI for the Toggle */} Enable Timestamps { setEnableTimestamps(val); - setTranscription(val ? [] : ''); // Reset transcription on toggle + setTranscription(val ? [] : ''); }} trackColor={{ false: '#767577', true: '#0f186e' }} thumbColor={enableTimestamps ? '#fff' : '#f4f3f4'} - disabled={model.isGenerating} // Disable changing mode while running + disabled={model.isGenerating} /> From 084cf1efced8710cbaa564753d05fb90f6916c99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 16:46:49 +0100 Subject: [PATCH 07/49] Apply further clearing --- apps/speech/screens/SpeechToTextScreen.tsx | 4 --- .../host_objects/JsiConversions.h | 1 - .../host_objects/ModelHostObject.h | 3 --- .../stream/OnlineASRProcessor.cpp | 25 ++++++------------- .../stream/OnlineASRProcessor.h | 3 --- .../speech_to_text/types/ProcessResult.h | 5 ---- .../useSpeechToText.ts | 14 ----------- 7 files changed, 7 insertions(+), 48 deletions(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index 735e66d78..5fba8d055 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -87,8 +87,6 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const decodedAudioData = await audioContext.decodeAudioDataSource(uri); const audioBuffer = decodedAudioData.getChannelData(0); - // CHANGE 4: Pass the toggle flag to transcribe - // TypeScript will infer the return type based on the flag if (enableTimestamps) { const result = await model.transcribe(audioBuffer, { enableTimestamps: enableTimestamps @@ -243,7 +241,6 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { }; const styles = StyleSheet.create({ - // ... existing styles ... 
container: { flex: 1, alignItems: 'center', @@ -275,7 +272,6 @@ const styles = StyleSheet.create({ marginTop: 12, alignItems: 'center', }, - // New style for the toggle toggleContainer: { flexDirection: 'row', alignItems: 'center', diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index 9e2df2fc8..d9d49b6b4 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -326,7 +326,6 @@ inline jsi::Value getJsiValue(const Word &word, jsi::Runtime &runtime) { inline jsi::Value getJsiValue(const std::vector &vec, jsi::Runtime &runtime) { jsi::Array array(runtime, vec.size()); for (size_t i = 0; i < vec.size(); ++i) { - // Convert each Word using the helper above and place in array array.setValueAtIndex(runtime, i, getJsiValue(vec[i], runtime)); } return {runtime, array}; diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index eeebd4b97..50797417e 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -20,14 +20,11 @@ #include #include #include -#include #include #include #include #include -using rnexecutorch::models::speech_to_text::types::Word; - namespace rnexecutorch { template class ModelHostObject : public JsiHostObject { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp index f62986b72..3137d274b 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp @@ -34,12 +34,14 @@ ProcessResult OnlineASRProcessor::processIter(const DecodingOptions &options) { chunkCompletedSegment(res); } + auto move_to_vector = [](auto& container) { + return std::vector(std::make_move_iterator(container.begin()), + std::make_move_iterator(container.end())); + }; + std::deque nonCommittedWords = this->hypothesisBuffer.complete(); - // return {this->toFlush(flushed), this->toFlush(nonCommittedWords)}; - return {std::vector(std::make_move_iterator(flushed.begin()), - std::make_move_iterator(flushed.end())), - std::vector(std::make_move_iterator(nonCommittedWords.begin()), - std::make_move_iterator(nonCommittedWords.end()))}; + + return { move_to_vector(flushed), move_to_vector(nonCommittedWords) }; } void OnlineASRProcessor::chunkCompletedSegment(std::span res) { @@ -86,22 +88,9 @@ std::vector OnlineASRProcessor::finish() { std::vector buffer(std::make_move_iterator(bufferDeq.begin()), std::make_move_iterator(bufferDeq.end())); - // std::string committedText = this->toFlush(buffer); this->bufferTimeOffset += static_cast(audioBuffer.size()) / OnlineASRProcessor::kSamplingRate; return buffer; } -// std::string OnlineASRProcessor::toFlush(const std::deque &words) const -// { -// std::string text; -// text.reserve(std::accumulate( -// words.cbegin(), words.cend(), 0, -// [](size_t sum, const Word &w) { return sum + w.content.size(); })); -// for (const auto &word 
: words) { -// text.append(word.content); -// } -// return text; -// } - } // namespace rnexecutorch::models::speech_to_text::stream diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h index 720e6bf76..3abaad3b6 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h @@ -12,7 +12,6 @@ class OnlineASRProcessor { void insertAudioChunk(std::span audio); types::ProcessResult processIter(const types::DecodingOptions &options); - // std::string finish(); std::vector finish(); std::vector audioBuffer; @@ -27,8 +26,6 @@ class OnlineASRProcessor { void chunkCompletedSegment(std::span res); void chunkAt(float time); - - // std::string toFlush(const std::deque &words) const; }; } // namespace rnexecutorch::models::speech_to_text::stream diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h index 685ba2b76..681495e2a 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h @@ -4,11 +4,6 @@ namespace rnexecutorch::models::speech_to_text::types { -// struct ProcessResult { -// std::string committed; -// std::string nonCommitted; -// }; - struct ProcessResult { std::vector committed; std::vector nonCommitted; diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index f9b5da8b1..f9af79a54 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -26,7 +26,6 @@ export const useSpeechToText = ({ const [modelInstance] = useState(() => new SpeechToTextModule()); - // FIX 1: Allow state to be either string or Word[] const [committedTranscription, setCommittedTranscription] = useState< string | Word[] >(''); @@ -82,12 +81,6 @@ export const useSpeechToText = ({ const stream = useCallback( async (options?: DecodingOptions & { enableTimestamps?: boolean }) => { - console.log( - '[2] Hook: Stream called. Ready:', - isReady, - 'Generating:', - isGenerating - ); if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, @@ -101,7 +94,6 @@ export const useSpeechToText = ({ setIsGenerating(true); - // FIX 2: Reset based on the mode requested const enableTimestamps = options?.enableTimestamps ?? false; setCommittedTranscription(enableTimestamps ? [] : ''); setNonCommittedTranscription(enableTimestamps ? [] : ''); @@ -109,18 +101,13 @@ export const useSpeechToText = ({ let fullResult: string | Word[] = enableTimestamps ? 
[] : ''; try { - console.log('[3] Hook: Calling modelInstance.stream()'); - // @ts-ignore - Typescript struggles with the dual generator return type, but logic is safe for await (const { committed, nonCommitted } of modelInstance.stream( options )) { console.log(committed, nonCommitted); - // FIX 3: Dynamic Merging Logic if (typeof committed === 'string') { - // --- STRING MODE --- if (committed.length > 0) { setCommittedTranscription((prev) => { - // Safety check: if prev was somehow an array, reset it or cast to string const prevStr = typeof prev === 'string' ? prev : ''; return prevStr + committed; }); @@ -128,7 +115,6 @@ export const useSpeechToText = ({ } setNonCommittedTranscription(nonCommitted as string); } else { - // --- WORD[] MODE --- const committedWords = committed as Word[]; const nonCommittedWords = nonCommitted as Word[]; From 24c4606729637f6804669bc869780f223ce0c52e Mon Sep 17 00:00:00 2001 From: Mateusz Sluszniak <56299341+msluszniak@users.noreply.github.com> Date: Wed, 21 Jan 2026 17:15:36 +0100 Subject: [PATCH 08/49] Apply suggestion from @msluszniak --- .../hooks/natural_language_processing/useSpeechToText.ts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index f9af79a54..17d05962d 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -54,7 +54,14 @@ export const useSpeechToText = ({ setError(parseUnknownError(err)); } })(); - }, [modelInstance, model, preventLoad]); + }, [ + modelInstance, + model.isMultilingual, + model.encoderSource, + model.decoderSource, + model.tokenizerSource, + preventLoad, + ]); const stateWrapper = useCallback( Promise>(fn: T) => From 8b019fe1df6fb17e7804fde690e1bface6048a49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 17:31:07 +0100 Subject: [PATCH 09/49] Apply autofix lint changes --- apps/speech/screens/SpeechToTextScreen.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index 5fba8d055..b1693968b 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -89,7 +89,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { if (enableTimestamps) { const result = await model.transcribe(audioBuffer, { - enableTimestamps: enableTimestamps + enableTimestamps: enableTimestamps, }); setTranscription(result); } From 5eab00da023019fb99c0959acd8c3b1c2287124c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 17:57:16 +0100 Subject: [PATCH 10/49] Fix linter issues --- apps/llm/app/voice_chat/index.tsx | 14 ++++++++++-- .../useSpeechToText.ts | 22 +++++++++++++------ .../SpeechToTextModule.ts | 2 +- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/apps/llm/app/voice_chat/index.tsx b/apps/llm/app/voice_chat/index.tsx index 79a713c93..0bf4c9b30 100644 --- a/apps/llm/app/voice_chat/index.tsx +++ b/apps/llm/app/voice_chat/index.tsx @@ -76,7 +76,11 @@ function VoiceChatScreen() { }); recorder.start(); const transcription = await speechToText.stream(); - await llm.sendMessage(transcription); + await llm.sendMessage( + typeof 
transcription === 'string' + ? transcription + : transcription.map((w) => w.word).join(' ') + ); } }; @@ -105,7 +109,13 @@ function VoiceChatScreen() { ...llm.messageHistory, { role: 'user', - content: speechToText.committedTranscription, + content: + typeof speechToText.committedTranscription === + 'string' + ? speechToText.committedTranscription + : speechToText.committedTranscription + .map((w) => w.word) + .join(' '), }, ] : llm.messageHistory diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index 17d05962d..083cdaf2d 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -108,11 +108,17 @@ export const useSpeechToText = ({ let fullResult: string | Word[] = enableTimestamps ? [] : ''; try { - for await (const { committed, nonCommitted } of modelInstance.stream( - options - )) { - console.log(committed, nonCommitted); + const streamGen = modelInstance.stream( + options as any + ) as AsyncGenerator<{ + committed: string | Word[]; + nonCommitted: string | Word[]; + }>; + + for await (const { committed, nonCommitted } of streamGen) { if (typeof committed === 'string') { + const nc = nonCommitted as unknown as string; + if (committed.length > 0) { setCommittedTranscription((prev) => { const prevStr = typeof prev === 'string' ? prev : ''; @@ -120,12 +126,12 @@ export const useSpeechToText = ({ }); (fullResult as string) += committed; } - setNonCommittedTranscription(nonCommitted as string); + setNonCommittedTranscription(nc); } else { const committedWords = committed as Word[]; const nonCommittedWords = nonCommitted as Word[]; - if (committedWords.length > 0) { + if (committedWords && committedWords.length > 0) { setCommittedTranscription((prev) => { const prevArr = Array.isArray(prev) ? 
prev : []; return [...prevArr, ...committedWords]; @@ -166,7 +172,9 @@ export const useSpeechToText = ({ nonCommittedTranscription, encode: stateWrapper(SpeechToTextModule.prototype.encode), decode: stateWrapper(SpeechToTextModule.prototype.decode), - transcribe: stateWrapper(SpeechToTextModule.prototype.transcribe), + transcribe: stateWrapper( + SpeechToTextModule.prototype.transcribe + ) as SpeechToTextModule['transcribe'], stream, streamStop: wrapper(SpeechToTextModule.prototype.streamStop), streamInsert: wrapper(SpeechToTextModule.prototype.streamInsert), diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts index cc5669b27..5891e4cd5 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts @@ -166,7 +166,7 @@ export class SpeechToTextModule { ), }); } catch (err) { - console.error('[Stream Decode Error]', err); + Logger.error('[Stream Decode Error]', err); } } else { queue.push({ committed, nonCommitted }); From db68c22089974f663128f05188053501084361ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Wed, 21 Jan 2026 18:40:42 +0100 Subject: [PATCH 11/49] Revert changing error messages --- .../hooks/natural_language_processing/useSpeechToText.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts index 083cdaf2d..9df9a88b1 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts @@ -69,12 +69,12 @@ export const useSpeechToText = ({ if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded.' + 'The model is currently not loaded. Please load the model before calling this function.' ); if (isGenerating) throw new RnExecutorchError( RnExecutorchErrorCode.ModelGenerating, - 'The model is currently generating.' + 'The model is currently generating. Please wait until previous model run is complete.' ); setIsGenerating(true); try { @@ -91,12 +91,12 @@ export const useSpeechToText = ({ if (!isReady) throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, - 'Model not loaded' + 'The model is currently not loaded. Please load the model before calling this function.' ); if (isGenerating) throw new RnExecutorchError( RnExecutorchErrorCode.ModelGenerating, - 'Model is generating' + 'The model is currently generating. Please wait until previous model run is complete.' 
          );

From 2a69753ec44bcf68d1a97f9fc6ebce37dc7bfd70 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?=
Date: Wed, 21 Jan 2026 18:42:25 +0100
Subject: [PATCH 12/49] Revert one more message

---
 .../src/hooks/natural_language_processing/useSpeechToText.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
index 9df9a88b1..611ec3153 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
@@ -155,7 +155,7 @@ export const useSpeechToText = ({
         if (!isReady)
           throw new RnExecutorchError(
             RnExecutorchErrorCode.ModuleNotLoaded,
-            'Model not loaded'
+            'The model is currently not loaded. Please load the model before calling this function.'
           );
         return fn.apply(modelInstance, args);
       };

From 11e01e837c00808420d02a6fdc5d67dcb2130308 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?=
Date: Wed, 21 Jan 2026 19:42:44 +0100
Subject: [PATCH 13/49] Update docs

---
 .../useSpeechToText.md    | 71 +++++++++++++++----
 .../SpeechToTextModule.md | 27 ++++++-
 2 files changed, 82 insertions(+), 16 deletions(-)

diff --git a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md
index 5b0545cf2..85c049f9c 100644
--- a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md
+++ b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md
@@ -83,6 +83,7 @@ Please note, that both [`transcribe`](../../06-api-reference/interfaces/SpeechTo
 
 To get more details please read: [`SpeechToTextType` API Reference](../../06-api-reference/interfaces/SpeechToTextType.md).
 
+
 ## Running the model
 
 Before running the model's [`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe) method, make sure to extract the audio waveform you want to transcribe. You'll need to handle this step yourself, ensuring the audio is sampled at 16 kHz. Once you have the waveform, pass it as an argument to the transcribe method. The method returns a promise that resolves to the generated transcription on success, or an error if inference fails.
@@ -101,12 +102,25 @@ const model = useSpeechToText({
 const transcription = await model.transcribe(spanishAudio, { language: 'es' });
 ```
 
+### Timestamps
+
+You can obtain word-level timestamps by setting `enableTimestamps: true` in the options. This changes the return type from a string to an array of `Word` objects.
+
+```typescript
+const words = await model.transcribe(audioBuffer, { enableTimestamps: true });
+// words: [{ word: "Hello", start: 0.0, end: 0.4 }, ...]
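+// If you also need plain text, the words can be joined back together, as
+// the voice chat example in this repo does (shown here for illustration):
+// const text = words.map((w) => w.word).join(' ');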
+```

## Example

```tsx
import React, { useState } from 'react';
-import { Button, Text } from 'react-native';
-import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch';
+import { Button, Text, View } from 'react-native';
+import {
+  useSpeechToText,
+  WHISPER_TINY_EN,
+  Word,
+} from 'react-native-executorch';
 import { AudioContext } from 'react-native-audio-api';
 import * as FileSystem from 'expo-file-system';
 
 function App() {
   const model = useSpeechToText({
     model: WHISPER_TINY_EN,
   });
 
-  const [transcription, setTranscription] = useState('');
+  const [transcription, setTranscription] = useState<string | Word[]>('');
 
   const loadAudio = async () => {
     const { uri } = await FileSystem.downloadAsync(
       'https://some-audio-url.com/file.mp3',
       FileSystem.cacheDirectory + 'audio_file'
     );
     const audioContext = new AudioContext({ sampleRate: 16000 });
     const decodedAudioData = await audioContext.decodeAudioDataSource(uri);
     return decodedAudioData.getChannelData(0);
   };
 
   const handleTranscribe = async () => {
     const audio = await loadAudio();
-    await model.transcribe(audio);
+    // Default text transcription
+    const result = await model.transcribe(audio);
+    setTranscription(result);
+  };
+
+  const handleTranscribeWithTimestamps = async () => {
+    const audio = await loadAudio();
+    // Transcription with timestamps
+    const result = await model.transcribe(audio, { enableTimestamps: true });
+    setTranscription(result);
+  };
+
+  const renderContent = () => {
+    if (typeof transcription === 'string') {
+      return <Text>{transcription}</Text>;
+    }
+    return transcription.map((w, i) => (
+      <Text key={i}>
+        {w.word} ({w.start.toFixed(2)}s)
+      </Text>
+    ));
   };
 
   return (
     <>
       {transcription}