From f372dbd3f6a1e709e7b1e35849e509b263116c9b Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Wed, 19 Nov 2025 17:25:36 +0900 Subject: [PATCH] fix: #523 transcript removal issue when being interrupted --- .changeset/khaki-bobcats-swim.md | 5 ++ examples/realtime-next/src/app/page.tsx | 3 +- .../realtime-next/src/app/websocket/page.tsx | 3 +- .../agents-realtime/src/openaiRealtimeBase.ts | 9 +++ .../src/openaiRealtimeWebRtc.ts | 4 ++ .../src/openaiRealtimeWebsocket.ts | 15 +++-- packages/agents-realtime/src/utils.ts | 52 ++++++++++++++++- .../test/openaiRealtimeWebRtc.test.ts | 37 ++++++++++++ .../test/openaiRealtimeWebsocket.test.ts | 46 +++++++++++++++ packages/agents-realtime/test/utils.test.ts | 56 +++++++++++++++++++ 10 files changed, 221 insertions(+), 9 deletions(-) create mode 100644 .changeset/khaki-bobcats-swim.md diff --git a/.changeset/khaki-bobcats-swim.md b/.changeset/khaki-bobcats-swim.md new file mode 100644 index 00000000..6e9effb1 --- /dev/null +++ b/.changeset/khaki-bobcats-swim.md @@ -0,0 +1,5 @@ +--- +'@openai/agents-realtime': patch +--- + +fix: #523 transcript removal issue when being interrupted diff --git a/examples/realtime-next/src/app/page.tsx b/examples/realtime-next/src/app/page.tsx index 0d16e3a2..552bbc5e 100644 --- a/examples/realtime-next/src/app/page.tsx +++ b/examples/realtime-next/src/app/page.tsx @@ -9,7 +9,6 @@ import { OutputGuardrailTripwireTriggered, RealtimeItem, RealtimeContextData, - backgroundResult, } from '@openai/agents/realtime'; import { useEffect, useRef, useState } from 'react'; import { z } from 'zod'; @@ -39,7 +38,7 @@ const weatherTool = tool({ location: z.string(), }), execute: async ({ location }) => { - return backgroundResult(`The weather in ${location} is sunny.`); + return `The weather in ${location} is sunny.`; }, }); diff --git a/examples/realtime-next/src/app/websocket/page.tsx b/examples/realtime-next/src/app/websocket/page.tsx index f62bc464..da24351a 100644 --- a/examples/realtime-next/src/app/websocket/page.tsx +++ b/examples/realtime-next/src/app/websocket/page.tsx @@ -8,7 +8,6 @@ import { RealtimeItem, OutputGuardrailTripwireTriggered, RealtimeOutputGuardrail, - backgroundResult, } from '@openai/agents/realtime'; import { useEffect, useRef, useState } from 'react'; import { z } from 'zod'; @@ -52,7 +51,7 @@ const weatherTool = tool({ location: z.string(), }), execute: async ({ location }) => { - return backgroundResult(`The weather in ${location} is sunny.`); + return `The weather in ${location} is sunny.`; }, }); diff --git a/packages/agents-realtime/src/openaiRealtimeBase.ts b/packages/agents-realtime/src/openaiRealtimeBase.ts index a7144de2..21b2f107 100644 --- a/packages/agents-realtime/src/openaiRealtimeBase.ts +++ b/packages/agents-realtime/src/openaiRealtimeBase.ts @@ -167,6 +167,14 @@ export abstract class OpenAIRealtimeBase abstract readonly muted: boolean | null; + /** + * Hook for subclasses to clean up transport-specific state when audio + * playback finishes. Defaults to a no-op. + */ + protected _afterAudioDoneEvent(): void { + // Intentionally empty. + } + protected get _rawSessionConfig(): Record | null { return this.#rawSessionConfig ?? null; } @@ -252,6 +260,7 @@ export abstract class OpenAIRealtimeBase if (parsed.type === 'response.output_audio.done') { this.emit('audio_done'); + this._afterAudioDoneEvent(); return; } diff --git a/packages/agents-realtime/src/openaiRealtimeWebRtc.ts b/packages/agents-realtime/src/openaiRealtimeWebRtc.ts index b822e965..90ea328d 100644 --- a/packages/agents-realtime/src/openaiRealtimeWebRtc.ts +++ b/packages/agents-realtime/src/openaiRealtimeWebRtc.ts @@ -348,6 +348,10 @@ export class OpenAIRealtimeWebRTC } } + protected override _afterAudioDoneEvent() { + this.#ongoingResponse = false; + } + /** * Close the connection to the Realtime API and disconnects the underlying WebRTC connection. */ diff --git a/packages/agents-realtime/src/openaiRealtimeWebsocket.ts b/packages/agents-realtime/src/openaiRealtimeWebsocket.ts index 3ee85c1c..85197ba8 100644 --- a/packages/agents-realtime/src/openaiRealtimeWebsocket.ts +++ b/packages/agents-realtime/src/openaiRealtimeWebsocket.ts @@ -105,6 +105,12 @@ export class OpenAIRealtimeWebSocket #ongoingResponse: boolean = false; #createWebSocket?: (options: CreateWebSocketOptions) => Promise; #skipOpenEventListeners?: boolean; + #resetAudioPlaybackState() { + this.#currentItemId = undefined; + this._firstAudioTimestamp = undefined; + this._audioLengthMs = 0; + this.#currentAudioContentIndex = undefined; + } constructor(options: OpenAIRealtimeWebSocketOptions = {}) { super(options); @@ -159,6 +165,10 @@ export class OpenAIRealtimeWebSocket this.emit('audio', audioEvent); } + protected override _afterAudioDoneEvent() { + this.#resetAudioPlaybackState(); + } + async #setupWebSocket( resolve: (value: void) => void, reject: (reason?: any) => void, @@ -471,9 +481,6 @@ export class OpenAIRealtimeWebSocket this._interrupt(elapsedTime, cancelOngoingResponse); } - this.#currentItemId = undefined; - this._firstAudioTimestamp = undefined; - this._audioLengthMs = 0; - this.#currentAudioContentIndex = undefined; + this.#resetAudioPlaybackState(); } } diff --git a/packages/agents-realtime/src/utils.ts b/packages/agents-realtime/src/utils.ts index bbbd82a2..b3c218ac 100644 --- a/packages/agents-realtime/src/utils.ts +++ b/packages/agents-realtime/src/utils.ts @@ -180,6 +180,51 @@ export function removeAudioFromContent( return item; } +// Realtime can resend truncated assistant items without transcripts after an +// interrupt/retrieve cycle. This helper merges those updates with the latest +// known transcript so UIs retain the portion of the message the user already +// heard. +function preserveAssistantAudioTranscripts( + existing: RealtimeMessageItem, + incoming: RealtimeMessageItem, +): RealtimeMessageItem { + if (existing.role !== 'assistant' || incoming.role !== 'assistant') { + return incoming; + } + + const mergedContent = incoming.content.map((entry, index) => { + if (entry.type !== 'output_audio') { + return entry; + } + + const transcriptMissing = + typeof entry.transcript !== 'string' || entry.transcript.length === 0; + if (!transcriptMissing) { + return entry; + } + + const previousEntry = existing.content[index]; + if ( + previousEntry && + previousEntry.type === 'output_audio' && + typeof previousEntry.transcript === 'string' && + previousEntry.transcript.length > 0 + ) { + return { + ...entry, + transcript: previousEntry.transcript, + }; + } + + return entry; + }); + + return { + ...incoming, + content: mergedContent, + }; +} + /** * Updates the realtime history array based on the incoming event and options. * @param history - The current history array. @@ -230,10 +275,15 @@ export function updateRealtimeHistory( ); if (existingIndex !== -1) { + const existingItem = history[existingIndex]; + const mergedEvent = + newEvent.type === 'message' && existingItem.type === 'message' + ? preserveAssistantAudioTranscripts(existingItem, newEvent) + : newEvent; // Update existing item return history.map((item, idx) => { if (idx === existingIndex) { - return newEvent; + return mergedEvent; } if (!shouldIncludeAudioData && item.type === 'message') { return removeAudioFromContent(item as any); diff --git a/packages/agents-realtime/test/openaiRealtimeWebRtc.test.ts b/packages/agents-realtime/test/openaiRealtimeWebRtc.test.ts index 6db2da4a..d0da9f26 100644 --- a/packages/agents-realtime/test/openaiRealtimeWebRtc.test.ts +++ b/packages/agents-realtime/test/openaiRealtimeWebRtc.test.ts @@ -149,6 +149,43 @@ describe('OpenAIRealtimeWebRTC.interrupt', () => { }); }); + it('stops sending response.cancel once audio playback is done', async () => { + const rtc = new OpenAIRealtimeWebRTC(); + await rtc.connect({ apiKey: 'ek_test' }); + + const channel = lastChannel as FakeRTCDataChannel; + channel.dispatchEvent( + new MessageEvent('message', { + data: JSON.stringify({ + type: 'response.created', + event_id: 'rc-1', + response: {}, + }), + }), + ); + + channel.dispatchEvent( + new MessageEvent('message', { + data: JSON.stringify({ + type: 'response.output_audio.done', + event_id: 'rc-done-1', + item_id: 'item-1', + content_index: 0, + output_index: 0, + response_id: 'resp-1', + }), + }), + ); + + channel.sent.length = 0; + rtc.interrupt(); + + expect(channel.sent).toHaveLength(1); + expect(JSON.parse(channel.sent[0])).toEqual({ + type: 'output_audio_buffer.clear', + }); + }); + it('updates currentModel on connect', async () => { const rtc = new OpenAIRealtimeWebRTC(); await rtc.connect({ apiKey: 'ek_test', model: 'rtc-model' }); diff --git a/packages/agents-realtime/test/openaiRealtimeWebsocket.test.ts b/packages/agents-realtime/test/openaiRealtimeWebsocket.test.ts index c978ad70..14b0c328 100644 --- a/packages/agents-realtime/test/openaiRealtimeWebsocket.test.ts +++ b/packages/agents-realtime/test/openaiRealtimeWebsocket.test.ts @@ -159,6 +159,52 @@ describe('OpenAIRealtimeWebSocket', () => { ).toBe(true); }); + it('does not send truncate events once audio playback completed', async () => { + const ws = new OpenAIRealtimeWebSocket(); + const sendSpy = vi + .spyOn(ws as any, 'sendEvent') + .mockImplementation(() => {}); + const p = ws.connect({ apiKey: 'ek', model: 'm' }); + await vi.runAllTimersAsync(); + await p; + + lastFakeSocket!.emit('message', { + data: JSON.stringify({ + type: 'response.output_audio.delta', + event_id: 'delta-1', + item_id: 'item-a', + content_index: 0, + delta: 'AA==', + output_index: 0, + response_id: 'resp-a', + }), + }); + + lastFakeSocket!.emit('message', { + data: JSON.stringify({ + type: 'response.output_audio.done', + event_id: 'done-1', + item_id: 'item-a', + content_index: 0, + output_index: 0, + response_id: 'resp-a', + }), + }); + + sendSpy.mockClear(); + + lastFakeSocket!.emit('message', { + data: JSON.stringify({ + type: 'input_audio_buffer.speech_started', + event_id: 'speech-1', + item_id: 'unused', + audio_start_ms: 0, + }), + }); + + expect(sendSpy).not.toHaveBeenCalled(); + }); + it('sendEvent throws when not connected', () => { const ws = new OpenAIRealtimeWebSocket(); expect(() => ws.sendEvent({ type: 'noop' } as any)).toThrow(); diff --git a/packages/agents-realtime/test/utils.test.ts b/packages/agents-realtime/test/utils.test.ts index da479dc8..cc4c9d94 100644 --- a/packages/agents-realtime/test/utils.test.ts +++ b/packages/agents-realtime/test/utils.test.ts @@ -163,6 +163,62 @@ describe('realtime utils', () => { expect((result[1] as any).content[0].audio).toBeNull(); }); + it('preserves assistant output_audio transcript when new item lacks it', () => { + const transcript = 'previous text'; + const history: RealtimeMessageItem[] = [ + { + itemId: '2', + type: 'message', + role: 'assistant', + status: 'completed', + content: [{ type: 'output_audio', transcript }], + }, + ]; + + const incoming: RealtimeMessageItem = { + itemId: '2', + type: 'message', + role: 'assistant', + status: 'incomplete', + content: [{ type: 'output_audio' } as any], + } as RealtimeMessageItem; + + const updated = updateRealtimeHistory(history, incoming, false); + expect(updated).toHaveLength(1); + const updatedMessage = updated[0] as RealtimeMessageItem; + const content = (updatedMessage as RealtimeMessageItem).content[0] as any; + expect(content.transcript).toBe(transcript); + if (updatedMessage.role === 'assistant' || updatedMessage.role === 'user') { + expect(updatedMessage.status).toBe('incomplete'); + } else { + throw new Error('Expected assistant message to retain transcript'); + } + }); + + it('prefers new transcript value when provided', () => { + const history: RealtimeMessageItem[] = [ + { + itemId: '3', + type: 'message', + role: 'assistant', + status: 'completed', + content: [{ type: 'output_audio', transcript: 'old' }], + }, + ]; + + const incoming: RealtimeMessageItem = { + itemId: '3', + type: 'message', + role: 'assistant', + status: 'completed', + content: [{ type: 'output_audio', transcript: 'new' }], + } as RealtimeMessageItem; + + const updated = updateRealtimeHistory(history, incoming, false); + const content = (updated[0] as RealtimeMessageItem).content[0] as any; + expect(content.transcript).toBe('new'); + }); + it('removeAudioFromContent strips input and output audio', () => { const userItem: RealtimeMessageItem = { itemId: 'u1',