diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt index 2e5092801..283af1481 100644 --- a/.cspell-wordlist.txt +++ b/.cspell-wordlist.txt @@ -104,3 +104,5 @@ POTTEDPLANT TVMONITOR sublist TTFT +timestamping +logprob diff --git a/apps/llm/app/voice_chat/index.tsx b/apps/llm/app/voice_chat/index.tsx index 79a713c93..27b4ad35d 100644 --- a/apps/llm/app/voice_chat/index.tsx +++ b/apps/llm/app/voice_chat/index.tsx @@ -1,4 +1,4 @@ -import { useContext, useEffect, useRef, useState } from 'react'; +import { useContext, useEffect, useState } from 'react'; import { Keyboard, KeyboardAvoidingView, @@ -35,6 +35,8 @@ export default function VoiceChatScreenWrapper() { function VoiceChatScreen() { const [isRecording, setIsRecording] = useState(false); + const [liveTranscription, setLiveTranscription] = useState(''); + const [recorder] = useState( () => new AudioRecorder({ @@ -42,7 +44,7 @@ function VoiceChatScreen() { bufferLengthInSamples: 1600, }) ); - const messageRecorded = useRef(false); + const { setGlobalGenerating } = useContext(GeneratingContext); const llm = useLLM({ model: QWEN3_0_6B_QUANTIZED }); @@ -67,16 +69,32 @@ function VoiceChatScreen() { if (isRecording) { setIsRecording(false); recorder.stop(); - messageRecorded.current = true; speechToText.streamStop(); } else { setIsRecording(true); + setLiveTranscription(''); + recorder.onAudioReady(({ buffer }) => { speechToText.streamInsert(buffer.getChannelData(0)); }); recorder.start(); - const transcription = await speechToText.stream(); - await llm.sendMessage(transcription); + + let finalResult = ''; + + try { + for await (const result of speechToText.stream()) { + const text = result.committed.text + result.nonCommitted.text; + setLiveTranscription(text); + finalResult = text; + } + } catch (e) { + console.error('Streaming error:', e); + } finally { + if (finalResult.trim().length > 0) { + await llm.sendMessage(finalResult); + setLiveTranscription(''); + } + } } }; @@ -96,16 +114,17 @@ function VoiceChatScreen() { Qwen 3 x Whisper - {llm.messageHistory.length || speechToText.committedTranscription ? ( + + {llm.messageHistory.length > 0 || liveTranscription.length > 0 ? ( 0 ? [ ...llm.messageHistory, { role: 'user', - content: speechToText.committedTranscription, + content: liveTranscription, }, ] : llm.messageHistory @@ -123,6 +142,7 @@ function VoiceChatScreen() { )} + {DeviceInfo.isEmulatorSync() ? ( diff --git a/apps/speech/app.json b/apps/speech/app.json index 693c815cb..1e6e36464 100644 --- a/apps/speech/app.json +++ b/apps/speech/app.json @@ -17,6 +17,9 @@ "bundleIdentifier": "com.anonymous.speech", "infoPlist": { "NSMicrophoneUsageDescription": "This app needs access to your microphone to record audio." 
+ }, + "entitlements": { + "com.apple.developer.kernel.increased-memory-limit": true } }, "android": { @@ -24,11 +27,34 @@ "foregroundImage": "./assets/adaptive-icon.png", "backgroundColor": "#ffffff" }, - "package": "com.anonymous.speech" + "package": "com.anonymous.speech", + "permissions": [ + "android.permission.RECORD_AUDIO", + "android.permission.MODIFY_AUDIO_SETTINGS", + "android.permission.FOREGROUND_SERVICE", + "android.permission.FOREGROUND_SERVICE_MEDIA_PLAYBACK" + ] }, "web": { "favicon": "./assets/favicon.png" }, - "plugins": ["expo-font"] + "plugins": [ + "expo-font", + [ + "react-native-audio-api", + { + "iosBackgroundMode": true, + "iosMicrophonePermission": "This app requires access to the microphone to record audio.", + "androidPermissions": [ + "android.permission.MODIFY_AUDIO_SETTINGS", + "android.permission.FOREGROUND_SERVICE", + "android.permission.FOREGROUND_SERVICE_MEDIA_PLAYBACK", + "android.permission.RECORD_AUDIO" + ], + "androidForegroundService": true, + "androidFSTypes": ["mediaPlayback", "microphone"] + } + ] + ] } } diff --git a/apps/speech/components/VerboseTranscription.tsx b/apps/speech/components/VerboseTranscription.tsx new file mode 100644 index 000000000..1093b2bd1 --- /dev/null +++ b/apps/speech/components/VerboseTranscription.tsx @@ -0,0 +1,241 @@ +import React from 'react'; +import { View, Text, StyleSheet } from 'react-native'; +import { TranscriptionResult } from 'react-native-executorch'; + +export const VerboseTranscription = ({ + data, +}: { + data: TranscriptionResult; +}) => { + if (!data) return null; + + const hasSegments = Array.isArray(data.segments) && data.segments.length > 0; + + const hasLanguage = + !!data.language && data.language !== 'N/A' && data.language.trim() !== ''; + + const hasDuration = typeof data.duration === 'number' && data.duration > 0; + + const hasMetadata = hasLanguage || hasDuration; + + return ( + + + Full Text: + {data.text || ''} + + {hasMetadata && ( + + {hasLanguage && ( + Language: {data.language} + )} + {hasDuration && ( + + Duration: {data.duration?.toFixed(2)}s + + )} + + )} + + + {hasSegments && ( + <> + + Segments ({data.segments?.length}) + + + {data.segments?.map((seg, index) => ( + + + + {seg.start.toFixed(2)}s - {seg.end.toFixed(2)}s + + ID: {index} + + + "{seg.text}" + + {seg.words && seg.words.length > 0 && ( + + Word Timestamps: + + {seg.words.map((w, wIdx) => ( + + {w.word.trim()} + + {w.start.toFixed(2)}s + + + ))} + + + )} + + + + Avg LogProb + + {data.task === 'transcribe' + ? seg.avgLogprob?.toFixed(4) + : 'N/A'} + + + + Temp + + {data.task === 'transcribe' + ? seg.temperature?.toFixed(2) + : 'N/A'} + + + + {/*eslint-disable-next-line @cspell/spellchecker*/} + Compr. + + {data.task === 'transcribe' + ? 
seg.compressionRatio?.toFixed(2) + : 'N/A'} + + + + + ))} + + )} + + ); +}; + +const styles = StyleSheet.create({ + container: { + padding: 4, + }, + metaContainer: { + marginBottom: 16, + padding: 12, + backgroundColor: '#f0f2f5', + borderRadius: 8, + }, + label: { + fontWeight: 'bold', + color: '#0f186e', + marginBottom: 4, + }, + text: { + fontSize: 16, + color: '#333', + marginBottom: 8, + }, + row: { + flexDirection: 'row', + gap: 10, + marginTop: 8, + }, + metaItem: { + fontSize: 12, + color: '#666', + backgroundColor: '#e1e4e8', + paddingHorizontal: 8, + paddingVertical: 2, + borderRadius: 4, + overflow: 'hidden', + }, + sectionHeader: { + fontSize: 18, + fontWeight: 'bold', + color: '#0f186e', + marginBottom: 8, + marginTop: 8, + }, + segmentCard: { + backgroundColor: '#fff', + borderRadius: 8, + borderWidth: 1, + borderColor: '#e1e4e8', + marginBottom: 12, + padding: 12, + shadowColor: '#000', + shadowOffset: { width: 0, height: 1 }, + shadowOpacity: 0.1, + shadowRadius: 2, + elevation: 2, + }, + segmentHeader: { + flexDirection: 'row', + justifyContent: 'space-between', + marginBottom: 8, + }, + timeBadge: { + fontSize: 12, + fontWeight: 'bold', + color: '#fff', + backgroundColor: '#0f186e', + paddingHorizontal: 8, + paddingVertical: 2, + borderRadius: 12, + overflow: 'hidden', + }, + segmentId: { + fontSize: 12, + color: '#888', + }, + segmentText: { + fontSize: 15, + fontStyle: 'italic', + color: '#333', + marginBottom: 12, + }, + statsGrid: { + flexDirection: 'row', + flexWrap: 'wrap', + gap: 8, + borderTopWidth: 1, + borderTopColor: '#f0f0f0', + paddingTop: 8, + }, + statItem: { + flex: 1, + minWidth: '45%', + flexDirection: 'row', + justifyContent: 'space-between', + }, + statLabel: { + fontSize: 11, + color: '#888', + }, + statValue: { + fontSize: 11, + fontWeight: '600', + color: '#444', + }, + wordsContainer: { + marginVertical: 8, + backgroundColor: '#f8f9fa', + padding: 8, + borderRadius: 6, + }, + wordsGrid: { + flexDirection: 'row', + flexWrap: 'wrap', + gap: 6, + marginTop: 4, + }, + wordChip: { + backgroundColor: '#ffffff', + borderWidth: 1, + borderColor: '#e1e4e8', + borderRadius: 4, + paddingHorizontal: 6, + paddingVertical: 2, + alignItems: 'center', + }, + wordText: { + fontSize: 12, + color: '#333', + }, + wordTime: { + fontSize: 9, + color: '#888', + marginTop: 1, + }, +}); diff --git a/apps/speech/package.json b/apps/speech/package.json index 094fa2b78..1c0607be2 100644 --- a/apps/speech/package.json +++ b/apps/speech/package.json @@ -19,7 +19,7 @@ "metro-config": "^0.81.0", "react": "19.1.0", "react-native": "0.81.5", - "react-native-audio-api": "0.6.5", + "react-native-audio-api": "0.11.3", "react-native-device-info": "^14.0.4", "react-native-executorch": "workspace:*", "react-native-reanimated": "~4.1.1", diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index da7ed0f7e..06813dfcd 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -8,9 +8,14 @@ import { TextInput, KeyboardAvoidingView, Platform, + Switch, } from 'react-native'; import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; -import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch'; +import { + useSpeechToText, + WHISPER_TINY_EN, + TranscriptionResult, +} from 'react-native-executorch'; import FontAwesome from '@expo/vector-icons/FontAwesome'; import { AudioManager, @@ -21,6 +26,8 @@ import * as FileSystem from 'expo-file-system/legacy'; import 
SWMIcon from '../assets/swm_icon.svg'; import DeviceInfo from 'react-native-device-info'; +import { VerboseTranscription } from '../components/VerboseTranscription'; + const isSimulator = DeviceInfo.isEmulatorSync(); export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { @@ -28,26 +35,34 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { model: WHISPER_TINY_EN, }); - const [transcription, setTranscription] = useState(''); + const [transcription, setTranscription] = + useState(null); + + const [liveResult, setLiveResult] = useState<{ + fullText: string; + segments: any[]; + } | null>(null); + + const [enableTimestamps, setEnableTimestamps] = useState(false); const [audioURL, setAudioURL] = useState(''); + + const isRecordingRef = useRef(false); const [liveTranscribing, setLiveTranscribing] = useState(false); const scrollViewRef = useRef(null); - const [recorder] = useState( - () => - new AudioRecorder({ - sampleRate: 16000, - bufferLengthInSamples: 1600, - }) - ); + const recorder = new AudioRecorder(); useEffect(() => { AudioManager.setAudioSessionOptions({ iosCategory: 'playAndRecord', iosMode: 'spokenAudio', - iosOptions: ['allowBluetooth', 'defaultToSpeaker'], + iosOptions: ['allowBluetoothHFP', 'defaultToSpeaker'], }); - AudioManager.requestRecordingPermissions(); + const checkPerms = async () => { + const granted = await AudioManager.requestRecordingPermissions(); + if (!granted) console.warn('Microphone permission denied!'); + }; + checkPerms(); }, []); async function getAudioFile(sourceUri: string) { @@ -72,40 +87,110 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { } const uri = await getAudioFile(audioURL); + // Reset previous states + setTranscription(null); + setLiveResult(null); const audioContext = new AudioContext({ sampleRate: 16000 }); try { - const decodedAudioData = await audioContext.decodeAudioDataSource(uri); + const decodedAudioData = await audioContext.decodeAudioData(uri); const audioBuffer = decodedAudioData.getChannelData(0); - setTranscription(await model.transcribe(audioBuffer)); + const result = await model.transcribe(audioBuffer, { + verbose: enableTimestamps, + }); + setTranscription(result); } catch (error) { console.error('Error decoding audio data', error); - console.warn('Note: Supported file formats: mp3, wav, flac'); return; } }; const handleStartTranscribeFromMicrophone = async () => { + isRecordingRef.current = true; setLiveTranscribing(true); - setTranscription(''); - recorder.onAudioReady(({ buffer }) => { - model.streamInsert(buffer.getChannelData(0)); - }); - recorder.start(); + + setTranscription(null); + setLiveResult({ fullText: '', segments: [] }); + + const sampleRate = 16000; + + recorder.onAudioReady( + { + sampleRate, + bufferLength: 0.1 * sampleRate, + channelCount: 1, + }, + ({ buffer }) => { + model.streamInsert(buffer.getChannelData(0)); + } + ); + + try { + const success = await AudioManager.setAudioSessionActivity(true); + if (!success) { + console.warn('Cannot start audio session correctly'); + } + const result = recorder.start(); + if (result.status === 'error') { + console.warn('Recording problems: ', result.message); + } + } catch (e) { + console.error('Failed to start recorder', e); + isRecordingRef.current = false; + setLiveTranscribing(false); + return; + } + + let accumulatedText = ''; + let accumulatedSegments: any[] = []; try { - await model.stream(); + const streamIter = model.stream({ + verbose: enableTimestamps, + }); + + for await (const { 
committed, nonCommitted } of streamIter) { + if (!isRecordingRef.current) break; + + if (committed.text) { + accumulatedText += committed.text; + } + if (committed.segments) { + accumulatedSegments = [...accumulatedSegments, ...committed.segments]; + } + + const currentDisplay = { + fullText: accumulatedText + nonCommitted.text, + segments: [...accumulatedSegments, ...(nonCommitted.segments || [])], + }; + + setLiveResult(currentDisplay); + } } catch (error) { console.error('Error during live transcription:', error); + } finally { + setLiveTranscribing(false); } }; const handleStopTranscribeFromMicrophone = () => { + isRecordingRef.current = false; + recorder.stop(); model.streamStop(); console.log('Live transcription stopped'); setLiveTranscribing(false); + + if (liveResult) { + setTranscription({ + text: liveResult.fullText, + segments: liveResult.segments, + language: 'en', + duration: 0, + }); + setLiveResult(null); + } }; const getModelStatus = () => { @@ -118,6 +203,20 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const readyToTranscribe = !model.isGenerating && model.isReady; const recordingButtonDisabled = isSimulator || !readyToTranscribe; + const getDisplayData = (): TranscriptionResult | null => { + if (liveTranscribing && liveResult) { + return { + text: liveResult.fullText, + segments: liveResult.segments, + language: 'en', + duration: 0, + }; + } + return transcription; + }; + + const displayData = getDisplayData(); + return ( @@ -138,6 +237,21 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { Status: {getModelStatus()} + + Enable Timestamps (Verbose) + { + setEnableTimestamps(val); + setTranscription(null); + setLiveResult(null); + }} + trackColor={{ false: '#767577', true: '#0f186e' }} + thumbColor={enableTimestamps ? '#fff' : '#f4f3f4'} + disabled={model.isGenerating} + /> + + Transcription void }) => { scrollViewRef.current?.scrollToEnd({ animated: true }) } > - - {transcription !== '' - ? transcription - : model.committedTranscription + - model.nonCommittedTranscription} - + {displayData ? ( + + ) : ( + + {liveTranscribing + ? 'Listening...' 
+                : 'No transcription yet...'}
+
+          )}
+
@@ -241,6 +358,17 @@ const styles = StyleSheet.create({
     marginTop: 12,
     alignItems: 'center',
   },
+  toggleContainer: {
+    flexDirection: 'row',
+    alignItems: 'center',
+    marginTop: 10,
+    marginBottom: 5,
+  },
+  toggleLabel: {
+    fontSize: 16,
+    marginRight: 10,
+    color: '#0f186e',
+  },
   transcriptionContainer: {
     flex: 1,
     width: '100%',
@@ -256,6 +384,11 @@ const styles = StyleSheet.create({
     borderWidth: 1,
     borderColor: '#0f186e',
     padding: 12,
+    maxHeight: 400,
+  },
+  placeholderText: {
+    color: '#aaa',
+    fontStyle: 'italic',
   },
   inputContainer: {
     marginBottom: 12,
diff --git a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md
index 5b0545cf2..ce1aa3f06 100644
--- a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md
+++ b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md
@@ -47,12 +47,12 @@ const { uri } = await FileSystem.downloadAsync(
 );
 
 const audioContext = new AudioContext({ sampleRate: 16000 });
-const decodedAudioData = await audioContext.decodeAudioDataSource(uri);
+const decodedAudioData = await audioContext.decodeAudioData(uri);
 const audioBuffer = decodedAudioData.getChannelData(0);
 
 try {
   const transcription = await model.transcribe(audioBuffer);
-  console.log(transcription);
+  console.log(transcription.text);
 } catch (error) {
   console.error('Error during audio transcription', error);
 }
@@ -101,12 +101,52 @@ const model = useSpeechToText({
 const transcription = await model.transcribe(spanishAudio, { language: 'es' });
 ```
 
+### Timestamps & Transcription Stat Data
+
+You can obtain word-level timestamps and additional transcription metadata from the [`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe) and [`stream`](../../06-api-reference/interfaces/SpeechToTextType.md#stream) methods by setting `verbose: true` in the options. The result mimics the _verbose_json_ response format of the OpenAI Whisper API. For more information, see the [`transcribe`](../../06-api-reference/interfaces/SpeechToTextType.md#transcribe), [`stream`](../../06-api-reference/interfaces/SpeechToTextType.md#stream), and [`TranscriptionResult`](../../06-api-reference/interfaces/TranscriptionResult.md) API references.
+
+```typescript
+const transcription = await model.transcribe(audioBuffer, { verbose: true });
+// Example result
+//
+// transcription: {
+//   task: "transcribe",
+//   text: "Example text for a ...",
+//   duration: 9.05,
+//   language: "en",
+//   segments: [
+//     {
+//       start: 0,
+//       end: 5.4,
+//       text: "Example text for",
+//       words: [
+//         {
+//           word: "Example",
+//           start: 0,
+//           end: 1.4
+//         },
+//         ...
+//       ],
+//       tokens: [1, 32, 45, ...],
+//       temperature: 0.0,
+//       avgLogprob: -1.235,
+//       compressionRatio: 1.632
+//     },
+//     ...
+//   ]
+// }
+```
+
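+The same `verbose` option can be passed to [`stream`](../../06-api-reference/interfaces/SpeechToTextType.md#stream). A minimal sketch of consuming verbose streaming results, assuming audio chunks are already being fed through `streamInsert` (e.g. from a microphone recorder):
+
+```typescript
+for await (const { committed, nonCommitted } of model.stream({ verbose: true })) {
+  // Committed segments are final and carry timestamps plus per-segment stats.
+  for (const segment of committed.segments ?? []) {
+    console.log(`[${segment.start.toFixed(2)}s - ${segment.end.toFixed(2)}s] ${segment.text}`);
+  }
+  // Non-committed text is a preview that may still change.
+  console.log('pending:', nonCommitted.text);
+}
+```
+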
 ## Example
 
 ```tsx
 import React, { useState } from 'react';
-import { Button, Text } from 'react-native';
-import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch';
+import { Button, Text, View } from 'react-native';
+import {
+  useSpeechToText,
+  WHISPER_TINY_EN,
+  TranscriptionResult,
+} from 'react-native-executorch';
 import { AudioContext } from 'react-native-audio-api';
 import * as FileSystem from 'expo-file-system';
@@ -115,7 +155,7 @@ function App() {
     model: WHISPER_TINY_EN,
   });
 
-  const [transcription, setTranscription] = useState('');
+  const [transcription, setTranscription] = useState<TranscriptionResult | null>(null);
 
   const loadAudio = async () => {
     const { uri } = await FileSystem.downloadAsync(
@@ -132,14 +172,45 @@ const handleTranscribe = async () => {
     const audio = await loadAudio();
-    await model.transcribe(audio);
+    // Default text transcription
+    const result = await model.transcribe(audio);
+    setTranscription(result);
+  };
+
+  const handleTranscribeWithTimestamps = async () => {
+    const audio = await loadAudio();
+    // Transcription with timestamps
+    const result = await model.transcribe(audio, { verbose: true });
+    setTranscription(result);
+  };
+
+  // Custom logic for rendering the transcription, e.g.:
+  const renderContent = () => {
+    if (!transcription) return <Text>Press a button to transcribe</Text>;
+
+    if (transcription.segments && transcription.segments.length > 0) {
+      return (
+        <Text>
+          {transcription.text +
+            '\n\nNum segments: ' +
+            transcription.segments.length.toString()}
+        </Text>
+      );
+    }
+    return <Text>{transcription.text}</Text>;
   };
 
   return (
-    <>
-      {transcription}
-