Skip to content

Commit 46df17d

Browse files
authored
fix: #523 transcript removal issue when being interrupted (#674)
1 parent 7a3a1b6 commit 46df17d

File tree

10 files changed

+221
-9
lines changed

10 files changed

+221
-9
lines changed

.changeset/khaki-bobcats-swim.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
'@openai/agents-realtime': patch
3+
---
4+
5+
fix: #523 transcript removal issue when being interrupted

examples/realtime-next/src/app/page.tsx

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ import {
99
OutputGuardrailTripwireTriggered,
1010
RealtimeItem,
1111
RealtimeContextData,
12-
backgroundResult,
1312
} from '@openai/agents/realtime';
1413
import { useEffect, useRef, useState } from 'react';
1514
import { z } from 'zod';
@@ -39,7 +38,7 @@ const weatherTool = tool({
3938
location: z.string(),
4039
}),
4140
execute: async ({ location }) => {
42-
return backgroundResult(`The weather in ${location} is sunny.`);
41+
return `The weather in ${location} is sunny.`;
4342
},
4443
});
4544

examples/realtime-next/src/app/websocket/page.tsx

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ import {
88
RealtimeItem,
99
OutputGuardrailTripwireTriggered,
1010
RealtimeOutputGuardrail,
11-
backgroundResult,
1211
} from '@openai/agents/realtime';
1312
import { useEffect, useRef, useState } from 'react';
1413
import { z } from 'zod';
@@ -52,7 +51,7 @@ const weatherTool = tool({
5251
location: z.string(),
5352
}),
5453
execute: async ({ location }) => {
55-
return backgroundResult(`The weather in ${location} is sunny.`);
54+
return `The weather in ${location} is sunny.`;
5655
},
5756
});
5857

packages/agents-realtime/src/openaiRealtimeBase.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,14 @@ export abstract class OpenAIRealtimeBase
167167

168168
abstract readonly muted: boolean | null;
169169

170+
/**
171+
* Hook for subclasses to clean up transport-specific state when audio
172+
* playback finishes. Defaults to a no-op.
173+
*/
174+
protected _afterAudioDoneEvent(): void {
175+
// Intentionally empty.
176+
}
177+
170178
protected get _rawSessionConfig(): Record<string, any> | null {
171179
return this.#rawSessionConfig ?? null;
172180
}
@@ -252,6 +260,7 @@ export abstract class OpenAIRealtimeBase
252260

253261
if (parsed.type === 'response.output_audio.done') {
254262
this.emit('audio_done');
263+
this._afterAudioDoneEvent();
255264
return;
256265
}
257266

packages/agents-realtime/src/openaiRealtimeWebRtc.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,10 @@ export class OpenAIRealtimeWebRTC
348348
}
349349
}
350350

351+
protected override _afterAudioDoneEvent() {
352+
this.#ongoingResponse = false;
353+
}
354+
351355
/**
352356
* Close the connection to the Realtime API and disconnects the underlying WebRTC connection.
353357
*/

packages/agents-realtime/src/openaiRealtimeWebsocket.ts

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,12 @@ export class OpenAIRealtimeWebSocket
105105
#ongoingResponse: boolean = false;
106106
#createWebSocket?: (options: CreateWebSocketOptions) => Promise<WebSocket>;
107107
#skipOpenEventListeners?: boolean;
108+
#resetAudioPlaybackState() {
109+
this.#currentItemId = undefined;
110+
this._firstAudioTimestamp = undefined;
111+
this._audioLengthMs = 0;
112+
this.#currentAudioContentIndex = undefined;
113+
}
108114

109115
constructor(options: OpenAIRealtimeWebSocketOptions = {}) {
110116
super(options);
@@ -159,6 +165,10 @@ export class OpenAIRealtimeWebSocket
159165
this.emit('audio', audioEvent);
160166
}
161167

168+
protected override _afterAudioDoneEvent() {
169+
this.#resetAudioPlaybackState();
170+
}
171+
162172
async #setupWebSocket(
163173
resolve: (value: void) => void,
164174
reject: (reason?: any) => void,
@@ -471,9 +481,6 @@ export class OpenAIRealtimeWebSocket
471481
this._interrupt(elapsedTime, cancelOngoingResponse);
472482
}
473483

474-
this.#currentItemId = undefined;
475-
this._firstAudioTimestamp = undefined;
476-
this._audioLengthMs = 0;
477-
this.#currentAudioContentIndex = undefined;
484+
this.#resetAudioPlaybackState();
478485
}
479486
}

packages/agents-realtime/src/utils.ts

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,51 @@ export function removeAudioFromContent(
180180
return item;
181181
}
182182

183+
// Realtime can resend truncated assistant items without transcripts after an
184+
// interrupt/retrieve cycle. This helper merges those updates with the latest
185+
// known transcript so UIs retain the portion of the message the user already
186+
// heard.
187+
function preserveAssistantAudioTranscripts(
188+
existing: RealtimeMessageItem,
189+
incoming: RealtimeMessageItem,
190+
): RealtimeMessageItem {
191+
if (existing.role !== 'assistant' || incoming.role !== 'assistant') {
192+
return incoming;
193+
}
194+
195+
const mergedContent = incoming.content.map((entry, index) => {
196+
if (entry.type !== 'output_audio') {
197+
return entry;
198+
}
199+
200+
const transcriptMissing =
201+
typeof entry.transcript !== 'string' || entry.transcript.length === 0;
202+
if (!transcriptMissing) {
203+
return entry;
204+
}
205+
206+
const previousEntry = existing.content[index];
207+
if (
208+
previousEntry &&
209+
previousEntry.type === 'output_audio' &&
210+
typeof previousEntry.transcript === 'string' &&
211+
previousEntry.transcript.length > 0
212+
) {
213+
return {
214+
...entry,
215+
transcript: previousEntry.transcript,
216+
};
217+
}
218+
219+
return entry;
220+
});
221+
222+
return {
223+
...incoming,
224+
content: mergedContent,
225+
};
226+
}
227+
183228
/**
184229
* Updates the realtime history array based on the incoming event and options.
185230
* @param history - The current history array.
@@ -230,10 +275,15 @@ export function updateRealtimeHistory(
230275
);
231276

232277
if (existingIndex !== -1) {
278+
const existingItem = history[existingIndex];
279+
const mergedEvent =
280+
newEvent.type === 'message' && existingItem.type === 'message'
281+
? preserveAssistantAudioTranscripts(existingItem, newEvent)
282+
: newEvent;
233283
// Update existing item
234284
return history.map((item, idx) => {
235285
if (idx === existingIndex) {
236-
return newEvent;
286+
return mergedEvent;
237287
}
238288
if (!shouldIncludeAudioData && item.type === 'message') {
239289
return removeAudioFromContent(item as any);

packages/agents-realtime/test/openaiRealtimeWebRtc.test.ts

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,43 @@ describe('OpenAIRealtimeWebRTC.interrupt', () => {
149149
});
150150
});
151151

152+
it('stops sending response.cancel once audio playback is done', async () => {
153+
const rtc = new OpenAIRealtimeWebRTC();
154+
await rtc.connect({ apiKey: 'ek_test' });
155+
156+
const channel = lastChannel as FakeRTCDataChannel;
157+
channel.dispatchEvent(
158+
new MessageEvent('message', {
159+
data: JSON.stringify({
160+
type: 'response.created',
161+
event_id: 'rc-1',
162+
response: {},
163+
}),
164+
}),
165+
);
166+
167+
channel.dispatchEvent(
168+
new MessageEvent('message', {
169+
data: JSON.stringify({
170+
type: 'response.output_audio.done',
171+
event_id: 'rc-done-1',
172+
item_id: 'item-1',
173+
content_index: 0,
174+
output_index: 0,
175+
response_id: 'resp-1',
176+
}),
177+
}),
178+
);
179+
180+
channel.sent.length = 0;
181+
rtc.interrupt();
182+
183+
expect(channel.sent).toHaveLength(1);
184+
expect(JSON.parse(channel.sent[0])).toEqual({
185+
type: 'output_audio_buffer.clear',
186+
});
187+
});
188+
152189
it('updates currentModel on connect', async () => {
153190
const rtc = new OpenAIRealtimeWebRTC();
154191
await rtc.connect({ apiKey: 'ek_test', model: 'rtc-model' });

packages/agents-realtime/test/openaiRealtimeWebsocket.test.ts

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,52 @@ describe('OpenAIRealtimeWebSocket', () => {
159159
).toBe(true);
160160
});
161161

162+
it('does not send truncate events once audio playback completed', async () => {
163+
const ws = new OpenAIRealtimeWebSocket();
164+
const sendSpy = vi
165+
.spyOn(ws as any, 'sendEvent')
166+
.mockImplementation(() => {});
167+
const p = ws.connect({ apiKey: 'ek', model: 'm' });
168+
await vi.runAllTimersAsync();
169+
await p;
170+
171+
lastFakeSocket!.emit('message', {
172+
data: JSON.stringify({
173+
type: 'response.output_audio.delta',
174+
event_id: 'delta-1',
175+
item_id: 'item-a',
176+
content_index: 0,
177+
delta: 'AA==',
178+
output_index: 0,
179+
response_id: 'resp-a',
180+
}),
181+
});
182+
183+
lastFakeSocket!.emit('message', {
184+
data: JSON.stringify({
185+
type: 'response.output_audio.done',
186+
event_id: 'done-1',
187+
item_id: 'item-a',
188+
content_index: 0,
189+
output_index: 0,
190+
response_id: 'resp-a',
191+
}),
192+
});
193+
194+
sendSpy.mockClear();
195+
196+
lastFakeSocket!.emit('message', {
197+
data: JSON.stringify({
198+
type: 'input_audio_buffer.speech_started',
199+
event_id: 'speech-1',
200+
item_id: 'unused',
201+
audio_start_ms: 0,
202+
}),
203+
});
204+
205+
expect(sendSpy).not.toHaveBeenCalled();
206+
});
207+
162208
it('sendEvent throws when not connected', () => {
163209
const ws = new OpenAIRealtimeWebSocket();
164210
expect(() => ws.sendEvent({ type: 'noop' } as any)).toThrow();

packages/agents-realtime/test/utils.test.ts

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,62 @@ describe('realtime utils', () => {
163163
expect((result[1] as any).content[0].audio).toBeNull();
164164
});
165165

166+
it('preserves assistant output_audio transcript when new item lacks it', () => {
167+
const transcript = 'previous text';
168+
const history: RealtimeMessageItem[] = [
169+
{
170+
itemId: '2',
171+
type: 'message',
172+
role: 'assistant',
173+
status: 'completed',
174+
content: [{ type: 'output_audio', transcript }],
175+
},
176+
];
177+
178+
const incoming: RealtimeMessageItem = {
179+
itemId: '2',
180+
type: 'message',
181+
role: 'assistant',
182+
status: 'incomplete',
183+
content: [{ type: 'output_audio' } as any],
184+
} as RealtimeMessageItem;
185+
186+
const updated = updateRealtimeHistory(history, incoming, false);
187+
expect(updated).toHaveLength(1);
188+
const updatedMessage = updated[0] as RealtimeMessageItem;
189+
const content = (updatedMessage as RealtimeMessageItem).content[0] as any;
190+
expect(content.transcript).toBe(transcript);
191+
if (updatedMessage.role === 'assistant' || updatedMessage.role === 'user') {
192+
expect(updatedMessage.status).toBe('incomplete');
193+
} else {
194+
throw new Error('Expected assistant message to retain transcript');
195+
}
196+
});
197+
198+
it('prefers new transcript value when provided', () => {
199+
const history: RealtimeMessageItem[] = [
200+
{
201+
itemId: '3',
202+
type: 'message',
203+
role: 'assistant',
204+
status: 'completed',
205+
content: [{ type: 'output_audio', transcript: 'old' }],
206+
},
207+
];
208+
209+
const incoming: RealtimeMessageItem = {
210+
itemId: '3',
211+
type: 'message',
212+
role: 'assistant',
213+
status: 'completed',
214+
content: [{ type: 'output_audio', transcript: 'new' }],
215+
} as RealtimeMessageItem;
216+
217+
const updated = updateRealtimeHistory(history, incoming, false);
218+
const content = (updated[0] as RealtimeMessageItem).content[0] as any;
219+
expect(content.transcript).toBe('new');
220+
});
221+
166222
it('removeAudioFromContent strips input and output audio', () => {
167223
const userItem: RealtimeMessageItem = {
168224
itemId: 'u1',

0 commit comments

Comments (0)