Skip to content

Commit 46df17d

Browse files
authored
fix: #523 transcript removal issue when being interrupted (#674)
1 parent 7a3a1b6 commit 46df17d

File tree

10 files changed

+221
-9
lines changed

10 files changed

+221
-9
lines changed

.changeset/khaki-bobcats-swim.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
'@openai/agents-realtime': patch
3+
---
4+
5+
fix: #523 transcript removal issue when being interrupted

examples/realtime-next/src/app/page.tsx

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ import {
99
OutputGuardrailTripwireTriggered,
1010
RealtimeItem,
1111
RealtimeContextData,
12-
backgroundResult,
1312
} from '@openai/agents/realtime';
1413
import { useEffect, useRef, useState } from 'react';
1514
import { z } from 'zod';
@@ -39,7 +38,7 @@ const weatherTool = tool({
3938
location: z.string(),
4039
}),
4140
execute: async ({ location }) => {
42-
return backgroundResult(`The weather in ${location} is sunny.`);
41+
return `The weather in ${location} is sunny.`;
4342
},
4443
});
4544

examples/realtime-next/src/app/websocket/page.tsx

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ import {
88
RealtimeItem,
99
OutputGuardrailTripwireTriggered,
1010
RealtimeOutputGuardrail,
11-
backgroundResult,
1211
} from '@openai/agents/realtime';
1312
import { useEffect, useRef, useState } from 'react';
1413
import { z } from 'zod';
@@ -52,7 +51,7 @@ const weatherTool = tool({
5251
location: z.string(),
5352
}),
5453
execute: async ({ location }) => {
55-
return backgroundResult(`The weather in ${location} is sunny.`);
54+
return `The weather in ${location} is sunny.`;
5655
},
5756
});
5857

packages/agents-realtime/src/openaiRealtimeBase.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,14 @@ export abstract class OpenAIRealtimeBase
167167

168168
abstract readonly muted: boolean | null;
169169

170+
/**
171+
* Hook for subclasses to clean up transport-specific state when audio
172+
* playback finishes. Defaults to a no-op.
173+
*/
174+
protected _afterAudioDoneEvent(): void {
175+
// Intentionally empty.
176+
}
177+
170178
protected get _rawSessionConfig(): Record<string, any> | null {
171179
return this.#rawSessionConfig ?? null;
172180
}
@@ -252,6 +260,7 @@ export abstract class OpenAIRealtimeBase
252260

253261
if (parsed.type === 'response.output_audio.done') {
254262
this.emit('audio_done');
263+
this._afterAudioDoneEvent();
255264
return;
256265
}
257266

packages/agents-realtime/src/openaiRealtimeWebRtc.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,10 @@ export class OpenAIRealtimeWebRTC
348348
}
349349
}
350350

351+
protected override _afterAudioDoneEvent() {
352+
this.#ongoingResponse = false;
353+
}
354+
351355
/**
352356
* Close the connection to the Realtime API and disconnects the underlying WebRTC connection.
353357
*/

packages/agents-realtime/src/openaiRealtimeWebsocket.ts

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,12 @@ export class OpenAIRealtimeWebSocket
105105
#ongoingResponse: boolean = false;
106106
#createWebSocket?: (options: CreateWebSocketOptions) => Promise<WebSocket>;
107107
#skipOpenEventListeners?: boolean;
108+
#resetAudioPlaybackState() {
109+
this.#currentItemId = undefined;
110+
this._firstAudioTimestamp = undefined;
111+
this._audioLengthMs = 0;
112+
this.#currentAudioContentIndex = undefined;
113+
}
108114

109115
constructor(options: OpenAIRealtimeWebSocketOptions = {}) {
110116
super(options);
@@ -159,6 +165,10 @@ export class OpenAIRealtimeWebSocket
159165
this.emit('audio', audioEvent);
160166
}
161167

168+
protected override _afterAudioDoneEvent() {
169+
this.#resetAudioPlaybackState();
170+
}
171+
162172
async #setupWebSocket(
163173
resolve: (value: void) => void,
164174
reject: (reason?: any) => void,
@@ -471,9 +481,6 @@ export class OpenAIRealtimeWebSocket
471481
this._interrupt(elapsedTime, cancelOngoingResponse);
472482
}
473483

474-
this.#currentItemId = undefined;
475-
this._firstAudioTimestamp = undefined;
476-
this._audioLengthMs = 0;
477-
this.#currentAudioContentIndex = undefined;
484+
this.#resetAudioPlaybackState();
478485
}
479486
}

packages/agents-realtime/src/utils.ts

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,51 @@ export function removeAudioFromContent(
180180
return item;
181181
}
182182

183+
// Realtime can resend truncated assistant items without transcripts after an
184+
// interrupt/retrieve cycle. This helper merges those updates with the latest
185+
// known transcript so UIs retain the portion of the message the user already
186+
// heard.
187+
function preserveAssistantAudioTranscripts(
188+
existing: RealtimeMessageItem,
189+
incoming: RealtimeMessageItem,
190+
): RealtimeMessageItem {
191+
if (existing.role !== 'assistant' || incoming.role !== 'assistant') {
192+
return incoming;
193+
}
194+
195+
const mergedContent = incoming.content.map((entry, index) => {
196+
if (entry.type !== 'output_audio') {
197+
return entry;
198+
}
199+
200+
const transcriptMissing =
201+
typeof entry.transcript !== 'string' || entry.transcript.length === 0;
202+
if (!transcriptMissing) {
203+
return entry;
204+
}
205+
206+
const previousEntry = existing.content[index];
207+
if (
208+
previousEntry &&
209+
previousEntry.type === 'output_audio' &&
210+
typeof previousEntry.transcript === 'string' &&
211+
previousEntry.transcript.length > 0
212+
) {
213+
return {
214+
...entry,
215+
transcript: previousEntry.transcript,
216+
};
217+
}
218+
219+
return entry;
220+
});
221+
222+
return {
223+
...incoming,
224+
content: mergedContent,
225+
};
226+
}
227+
183228
/**
184229
* Updates the realtime history array based on the incoming event and options.
185230
* @param history - The current history array.
@@ -230,10 +275,15 @@ export function updateRealtimeHistory(
230275
);
231276

232277
if (existingIndex !== -1) {
278+
const existingItem = history[existingIndex];
279+
const mergedEvent =
280+
newEvent.type === 'message' && existingItem.type === 'message'
281+
? preserveAssistantAudioTranscripts(existingItem, newEvent)
282+
: newEvent;
233283
// Update existing item
234284
return history.map((item, idx) => {
235285
if (idx === existingIndex) {
236-
return newEvent;
286+
return mergedEvent;
237287
}
238288
if (!shouldIncludeAudioData && item.type === 'message') {
239289
return removeAudioFromContent(item as any);

packages/agents-realtime/test/openaiRealtimeWebRtc.test.ts

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,43 @@ describe('OpenAIRealtimeWebRTC.interrupt', () => {
149149
});
150150
});
151151

152+
it('stops sending response.cancel once audio playback is done', async () => {
153+
const rtc = new OpenAIRealtimeWebRTC();
154+
await rtc.connect({ apiKey: 'ek_test' });
155+
156+
const channel = lastChannel as FakeRTCDataChannel;
157+
channel.dispatchEvent(
158+
new MessageEvent('message', {
159+
data: JSON.stringify({
160+
type: 'response.created',
161+
event_id: 'rc-1',
162+
response: {},
163+
}),
164+
}),
165+
);
166+
167+
channel.dispatchEvent(
168+
new MessageEvent('message', {
169+
data: JSON.stringify({
170+
type: 'response.output_audio.done',
171+
event_id: 'rc-done-1',
172+
item_id: 'item-1',
173+
content_index: 0,
174+
output_index: 0,
175+
response_id: 'resp-1',
176+
}),
177+
}),
178+
);
179+
180+
channel.sent.length = 0;
181+
rtc.interrupt();
182+
183+
expect(channel.sent).toHaveLength(1);
184+
expect(JSON.parse(channel.sent[0])).toEqual({
185+
type: 'output_audio_buffer.clear',
186+
});
187+
});
188+
152189
it('updates currentModel on connect', async () => {
153190
const rtc = new OpenAIRealtimeWebRTC();
154191
await rtc.connect({ apiKey: 'ek_test', model: 'rtc-model' });

packages/agents-realtime/test/openaiRealtimeWebsocket.test.ts

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,52 @@ describe('OpenAIRealtimeWebSocket', () => {
159159
).toBe(true);
160160
});
161161

162+
it('does not send truncate events once audio playback completed', async () => {
163+
const ws = new OpenAIRealtimeWebSocket();
164+
const sendSpy = vi
165+
.spyOn(ws as any, 'sendEvent')
166+
.mockImplementation(() => {});
167+
const p = ws.connect({ apiKey: 'ek', model: 'm' });
168+
await vi.runAllTimersAsync();
169+
await p;
170+
171+
lastFakeSocket!.emit('message', {
172+
data: JSON.stringify({
173+
type: 'response.output_audio.delta',
174+
event_id: 'delta-1',
175+
item_id: 'item-a',
176+
content_index: 0,
177+
delta: 'AA==',
178+
output_index: 0,
179+
response_id: 'resp-a',
180+
}),
181+
});
182+
183+
lastFakeSocket!.emit('message', {
184+
data: JSON.stringify({
185+
type: 'response.output_audio.done',
186+
event_id: 'done-1',
187+
item_id: 'item-a',
188+
content_index: 0,
189+
output_index: 0,
190+
response_id: 'resp-a',
191+
}),
192+
});
193+
194+
sendSpy.mockClear();
195+
196+
lastFakeSocket!.emit('message', {
197+
data: JSON.stringify({
198+
type: 'input_audio_buffer.speech_started',
199+
event_id: 'speech-1',
200+
item_id: 'unused',
201+
audio_start_ms: 0,
202+
}),
203+
});
204+
205+
expect(sendSpy).not.toHaveBeenCalled();
206+
});
207+
162208
it('sendEvent throws when not connected', () => {
163209
const ws = new OpenAIRealtimeWebSocket();
164210
expect(() => ws.sendEvent({ type: 'noop' } as any)).toThrow();

packages/agents-realtime/test/utils.test.ts

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,62 @@ describe('realtime utils', () => {
163163
expect((result[1] as any).content[0].audio).toBeNull();
164164
});
165165

166+
it('preserves assistant output_audio transcript when new item lacks it', () => {
167+
const transcript = 'previous text';
168+
const history: RealtimeMessageItem[] = [
169+
{
170+
itemId: '2',
171+
type: 'message',
172+
role: 'assistant',
173+
status: 'completed',
174+
content: [{ type: 'output_audio', transcript }],
175+
},
176+
];
177+
178+
const incoming: RealtimeMessageItem = {
179+
itemId: '2',
180+
type: 'message',
181+
role: 'assistant',
182+
status: 'incomplete',
183+
content: [{ type: 'output_audio' } as any],
184+
} as RealtimeMessageItem;
185+
186+
const updated = updateRealtimeHistory(history, incoming, false);
187+
expect(updated).toHaveLength(1);
188+
const updatedMessage = updated[0] as RealtimeMessageItem;
189+
const content = (updatedMessage as RealtimeMessageItem).content[0] as any;
190+
expect(content.transcript).toBe(transcript);
191+
if (updatedMessage.role === 'assistant' || updatedMessage.role === 'user') {
192+
expect(updatedMessage.status).toBe('incomplete');
193+
} else {
194+
throw new Error('Expected assistant message to retain transcript');
195+
}
196+
});
197+
198+
it('prefers new transcript value when provided', () => {
199+
const history: RealtimeMessageItem[] = [
200+
{
201+
itemId: '3',
202+
type: 'message',
203+
role: 'assistant',
204+
status: 'completed',
205+
content: [{ type: 'output_audio', transcript: 'old' }],
206+
},
207+
];
208+
209+
const incoming: RealtimeMessageItem = {
210+
itemId: '3',
211+
type: 'message',
212+
role: 'assistant',
213+
status: 'completed',
214+
content: [{ type: 'output_audio', transcript: 'new' }],
215+
} as RealtimeMessageItem;
216+
217+
const updated = updateRealtimeHistory(history, incoming, false);
218+
const content = (updated[0] as RealtimeMessageItem).content[0] as any;
219+
expect(content.transcript).toBe('new');
220+
});
221+
166222
it('removeAudioFromContent strips input and output audio', () => {
167223
const userItem: RealtimeMessageItem = {
168224
itemId: 'u1',

0 commit comments

Comments (0)