5 changes: 5 additions & 0 deletions .changeset/khaki-bobcats-swim.md
@@ -0,0 +1,5 @@
---
'@openai/agents-realtime': patch
---

fix: #523 transcript removal issue when being interrupted
3 changes: 1 addition & 2 deletions examples/realtime-next/src/app/page.tsx
@@ -9,7 +9,6 @@ import {
OutputGuardrailTripwireTriggered,
RealtimeItem,
RealtimeContextData,
backgroundResult,
} from '@openai/agents/realtime';
import { useEffect, useRef, useState } from 'react';
import { z } from 'zod';
@@ -39,7 +38,7 @@ const weatherTool = tool({
location: z.string(),
}),
execute: async ({ location }) => {
return backgroundResult(`The weather in ${location} is sunny.`);
return `The weather in ${location} is sunny.`;
},
});

3 changes: 1 addition & 2 deletions examples/realtime-next/src/app/websocket/page.tsx
@@ -8,7 +8,6 @@ import {
RealtimeItem,
OutputGuardrailTripwireTriggered,
RealtimeOutputGuardrail,
backgroundResult,
} from '@openai/agents/realtime';
import { useEffect, useRef, useState } from 'react';
import { z } from 'zod';
@@ -52,7 +51,7 @@ const weatherTool = tool({
location: z.string(),
}),
execute: async ({ location }) => {
return backgroundResult(`The weather in ${location} is sunny.`);
return `The weather in ${location} is sunny.`;
},
});

9 changes: 9 additions & 0 deletions packages/agents-realtime/src/openaiRealtimeBase.ts
@@ -167,6 +167,14 @@ export abstract class OpenAIRealtimeBase

abstract readonly muted: boolean | null;

/**
* Hook for subclasses to clean up transport-specific state when audio
* playback finishes. Defaults to a no-op.
*/
protected _afterAudioDoneEvent(): void {
// Intentionally empty.
}

protected get _rawSessionConfig(): Record<string, any> | null {
return this.#rawSessionConfig ?? null;
}
@@ -252,6 +260,7 @@ export abstract class OpenAIRealtimeBase

if (parsed.type === 'response.output_audio.done') {
this.emit('audio_done');
this._afterAudioDoneEvent();
return;
}

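The base class introduces `_afterAudioDoneEvent` as a protected no-op hook that runs right after `audio_done` is emitted; the WebRTC and WebSocket transports below override it to drop their playback bookkeeping. A minimal standalone sketch of that pattern follows — the class and field names are invented for the example and are not the SDK's own:

```typescript
// Standalone illustration of the hook pattern added in this PR; the class and
// field names here are made up for the example and are not part of the SDK.
abstract class TransportBase {
  // Called by the shared event parser when a response.output_audio.done
  // event arrives, after the 'audio_done' event has been emitted.
  handleAudioDone(): void {
    this._afterAudioDoneEvent();
  }

  // No-op by default; subclasses reset transport-specific playback state here.
  protected _afterAudioDoneEvent(): void {
    // Intentionally empty.
  }
}

class WebSocketLikeTransport extends TransportBase {
  private currentItemId?: string;
  private audioLengthMs = 0;

  protected override _afterAudioDoneEvent(): void {
    // Once playback has finished, forget the current item so a later
    // interrupt() has nothing left to truncate.
    this.currentItemId = undefined;
    this.audioLengthMs = 0;
  }
}

// Usage: after the done event is handled, an interrupt no longer targets
// audio the user has already heard in full.
const transport = new WebSocketLikeTransport();
transport.handleAudioDone();
```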
4 changes: 4 additions & 0 deletions packages/agents-realtime/src/openaiRealtimeWebRtc.ts
@@ -348,6 +348,10 @@ export class OpenAIRealtimeWebRTC
}
}

protected override _afterAudioDoneEvent() {
this.#ongoingResponse = false;
}

/**
* Close the connection to the Realtime API and disconnects the underlying WebRTC connection.
*/
15 changes: 11 additions & 4 deletions packages/agents-realtime/src/openaiRealtimeWebsocket.ts
@@ -105,6 +105,12 @@ export class OpenAIRealtimeWebSocket
#ongoingResponse: boolean = false;
#createWebSocket?: (options: CreateWebSocketOptions) => Promise<WebSocket>;
#skipOpenEventListeners?: boolean;
#resetAudioPlaybackState() {
this.#currentItemId = undefined;
this._firstAudioTimestamp = undefined;
this._audioLengthMs = 0;
this.#currentAudioContentIndex = undefined;
}

constructor(options: OpenAIRealtimeWebSocketOptions = {}) {
super(options);
@@ -159,6 +165,10 @@
this.emit('audio', audioEvent);
}

protected override _afterAudioDoneEvent() {
this.#resetAudioPlaybackState();
}

async #setupWebSocket(
resolve: (value: void) => void,
reject: (reason?: any) => void,
@@ -471,9 +481,6 @@
this._interrupt(elapsedTime, cancelOngoingResponse);
}

this.#currentItemId = undefined;
this._firstAudioTimestamp = undefined;
this._audioLengthMs = 0;
this.#currentAudioContentIndex = undefined;
this.#resetAudioPlaybackState();
}
}
52 changes: 51 additions & 1 deletion packages/agents-realtime/src/utils.ts
@@ -180,6 +180,51 @@ export function removeAudioFromContent(
return item;
}

// Realtime can resend truncated assistant items without transcripts after an
// interrupt/retrieve cycle. This helper merges those updates with the latest
// known transcript so UIs retain the portion of the message the user already
// heard.
function preserveAssistantAudioTranscripts(
existing: RealtimeMessageItem,
incoming: RealtimeMessageItem,
): RealtimeMessageItem {
if (existing.role !== 'assistant' || incoming.role !== 'assistant') {
return incoming;
}

const mergedContent = incoming.content.map((entry, index) => {
if (entry.type !== 'output_audio') {
return entry;
}

const transcriptMissing =
typeof entry.transcript !== 'string' || entry.transcript.length === 0;
if (!transcriptMissing) {
return entry;
}

const previousEntry = existing.content[index];
if (
previousEntry &&
previousEntry.type === 'output_audio' &&
typeof previousEntry.transcript === 'string' &&
previousEntry.transcript.length > 0
) {
return {
...entry,
transcript: previousEntry.transcript,
};
}

return entry;
});

return {
...incoming,
content: mergedContent,
};
}

/**
* Updates the realtime history array based on the incoming event and options.
* @param history - The current history array.
@@ -230,10 +275,15 @@ export function updateRealtimeHistory(
);

if (existingIndex !== -1) {
const existingItem = history[existingIndex];
const mergedEvent =
newEvent.type === 'message' && existingItem.type === 'message'
? preserveAssistantAudioTranscripts(existingItem, newEvent)
: newEvent;
// Update existing item
return history.map((item, idx) => {
if (idx === existingIndex) {
return newEvent;
return mergedEvent;
}
if (!shouldIncludeAudioData && item.type === 'message') {
return removeAudioFromContent(item as any);
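Taken together with the history update path above, the merge means an interrupted assistant item keeps the transcript portion the user already heard. A minimal usage sketch, assuming `updateRealtimeHistory` is in scope from `packages/agents-realtime/src/utils.ts`; the item shapes mirror the tests added below and the transcript text is invented for the example:

```typescript
// Assumes updateRealtimeHistory is imported from the package's utils module;
// the item shapes follow the tests added in this PR.
const history = [
  {
    itemId: 'item-1',
    type: 'message',
    role: 'assistant',
    status: 'completed',
    // Transcript of the audio the user already heard before interrupting.
    content: [{ type: 'output_audio', transcript: 'The weather in Oslo is' }],
  },
] as any[];

// After an interrupt/retrieve cycle the server can resend the truncated item
// without a transcript.
const retrieved = {
  itemId: 'item-1',
  type: 'message',
  role: 'assistant',
  status: 'incomplete',
  content: [{ type: 'output_audio' }],
} as any;

const updated = updateRealtimeHistory(history, retrieved, false);
// The merged item keeps the previously known transcript:
// updated[0].content[0].transcript === 'The weather in Oslo is'
```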
37 changes: 37 additions & 0 deletions packages/agents-realtime/test/openaiRealtimeWebRtc.test.ts
@@ -149,6 +149,43 @@ describe('OpenAIRealtimeWebRTC.interrupt', () => {
});
});

it('stops sending response.cancel once audio playback is done', async () => {
const rtc = new OpenAIRealtimeWebRTC();
await rtc.connect({ apiKey: 'ek_test' });

const channel = lastChannel as FakeRTCDataChannel;
channel.dispatchEvent(
new MessageEvent('message', {
data: JSON.stringify({
type: 'response.created',
event_id: 'rc-1',
response: {},
}),
}),
);

channel.dispatchEvent(
new MessageEvent('message', {
data: JSON.stringify({
type: 'response.output_audio.done',
event_id: 'rc-done-1',
item_id: 'item-1',
content_index: 0,
output_index: 0,
response_id: 'resp-1',
}),
}),
);

channel.sent.length = 0;
rtc.interrupt();

expect(channel.sent).toHaveLength(1);
expect(JSON.parse(channel.sent[0])).toEqual({
type: 'output_audio_buffer.clear',
});
});

it('updates currentModel on connect', async () => {
const rtc = new OpenAIRealtimeWebRTC();
await rtc.connect({ apiKey: 'ek_test', model: 'rtc-model' });
46 changes: 46 additions & 0 deletions packages/agents-realtime/test/openaiRealtimeWebsocket.test.ts
@@ -159,6 +159,52 @@ describe('OpenAIRealtimeWebSocket', () => {
).toBe(true);
});

it('does not send truncate events once audio playback completed', async () => {
const ws = new OpenAIRealtimeWebSocket();
const sendSpy = vi
.spyOn(ws as any, 'sendEvent')
.mockImplementation(() => {});
const p = ws.connect({ apiKey: 'ek', model: 'm' });
await vi.runAllTimersAsync();
await p;

lastFakeSocket!.emit('message', {
data: JSON.stringify({
type: 'response.output_audio.delta',
event_id: 'delta-1',
item_id: 'item-a',
content_index: 0,
delta: 'AA==',
output_index: 0,
response_id: 'resp-a',
}),
});

lastFakeSocket!.emit('message', {
data: JSON.stringify({
type: 'response.output_audio.done',
event_id: 'done-1',
item_id: 'item-a',
content_index: 0,
output_index: 0,
response_id: 'resp-a',
}),
});

sendSpy.mockClear();

lastFakeSocket!.emit('message', {
data: JSON.stringify({
type: 'input_audio_buffer.speech_started',
event_id: 'speech-1',
item_id: 'unused',
audio_start_ms: 0,
}),
});

expect(sendSpy).not.toHaveBeenCalled();
});

it('sendEvent throws when not connected', () => {
const ws = new OpenAIRealtimeWebSocket();
expect(() => ws.sendEvent({ type: 'noop' } as any)).toThrow();
56 changes: 56 additions & 0 deletions packages/agents-realtime/test/utils.test.ts
@@ -163,6 +163,62 @@ describe('realtime utils', () => {
expect((result[1] as any).content[0].audio).toBeNull();
});

it('preserves assistant output_audio transcript when new item lacks it', () => {
const transcript = 'previous text';
const history: RealtimeMessageItem[] = [
{
itemId: '2',
type: 'message',
role: 'assistant',
status: 'completed',
content: [{ type: 'output_audio', transcript }],
},
];

const incoming: RealtimeMessageItem = {
itemId: '2',
type: 'message',
role: 'assistant',
status: 'incomplete',
content: [{ type: 'output_audio' } as any],
} as RealtimeMessageItem;

const updated = updateRealtimeHistory(history, incoming, false);
expect(updated).toHaveLength(1);
const updatedMessage = updated[0] as RealtimeMessageItem;
const content = (updatedMessage as RealtimeMessageItem).content[0] as any;
expect(content.transcript).toBe(transcript);
if (updatedMessage.role === 'assistant' || updatedMessage.role === 'user') {
expect(updatedMessage.status).toBe('incomplete');
} else {
throw new Error('Expected assistant message to retain transcript');
}
});

it('prefers new transcript value when provided', () => {
const history: RealtimeMessageItem[] = [
{
itemId: '3',
type: 'message',
role: 'assistant',
status: 'completed',
content: [{ type: 'output_audio', transcript: 'old' }],
},
];

const incoming: RealtimeMessageItem = {
itemId: '3',
type: 'message',
role: 'assistant',
status: 'completed',
content: [{ type: 'output_audio', transcript: 'new' }],
} as RealtimeMessageItem;

const updated = updateRealtimeHistory(history, incoming, false);
const content = (updated[0] as RealtimeMessageItem).content[0] as any;
expect(content.transcript).toBe('new');
});

it('removeAudioFromContent strips input and output audio', () => {
const userItem: RealtimeMessageItem = {
itemId: 'u1',