45 changes: 41 additions & 4 deletions packages/typescript/ai-openai/src/model-meta.ts
@@ -1,4 +1,5 @@
import type {
OpenAIAudioOutputOptions,
OpenAIBaseOptions,
OpenAIMetadataOptions,
OpenAIReasoningOptions,
@@ -1072,6 +1073,32 @@ const GPT_4O_AUDIO = {
OpenAIMetadataOptions
>

const GPT_4O_AUDIO_PREVIEW = {
name: 'gpt-4o-audio-preview',
context_window: 128_000,
max_output_tokens: 16_384,
knowledge_cutoff: '2023-10-01',
pricing: {
input: {
normal: 2.5,
},
output: {
normal: 10,
},
},
supports: {
input: ['text', 'audio'],
output: ['text', 'audio'],
endpoints: ['chat-completions'],
features: ['streaming', 'function_calling'],
},
} as const satisfies ModelMeta<
OpenAIBaseOptions &
OpenAIToolsOptions &
OpenAIStreamingOptions &
OpenAIMetadataOptions
>

const GPT_4O_MINI = {
name: 'gpt-4o-mini',
context_window: 128_000,
@@ -1635,6 +1662,7 @@ export const OPENAI_CHAT_MODELS = [
GPT_AUDIO.name,
GPT_AUDIO_MINI.name,
GPT_4O_AUDIO.name,
GPT_4O_AUDIO_PREVIEW.name,
GPT_4O_MINI_AUDIO.name,
// ChatGPT models
GPT_5_1_CHAT.name,
@@ -1879,16 +1907,24 @@ export type OpenAIChatModelProviderOptionsByName = {
// Audio models
[GPT_AUDIO.name]: OpenAIBaseOptions &
OpenAIStreamingOptions &
OpenAIMetadataOptions
OpenAIMetadataOptions &
OpenAIAudioOutputOptions
[GPT_AUDIO_MINI.name]: OpenAIBaseOptions &
OpenAIStreamingOptions &
OpenAIMetadataOptions
OpenAIMetadataOptions &
OpenAIAudioOutputOptions
[GPT_4O_AUDIO.name]: OpenAIBaseOptions &
OpenAIStreamingOptions &
OpenAIMetadataOptions
OpenAIMetadataOptions &
OpenAIAudioOutputOptions
[GPT_4O_AUDIO_PREVIEW.name]: OpenAIBaseOptions &
OpenAIStreamingOptions &
OpenAIMetadataOptions &
OpenAIAudioOutputOptions
[GPT_4O_MINI_AUDIO.name]: OpenAIBaseOptions &
OpenAIStreamingOptions &
OpenAIMetadataOptions
OpenAIMetadataOptions &
OpenAIAudioOutputOptions

// Chat-only models
[GPT_5_1_CHAT.name]: OpenAIBaseOptions &
@@ -1974,6 +2010,7 @@ export type OpenAIModelInputModalitiesByName = {
[GPT_AUDIO.name]: typeof GPT_AUDIO.supports.input
[GPT_AUDIO_MINI.name]: typeof GPT_AUDIO_MINI.supports.input
[GPT_4O_AUDIO.name]: typeof GPT_4O_AUDIO.supports.input
[GPT_4O_AUDIO_PREVIEW.name]: typeof GPT_4O_AUDIO_PREVIEW.supports.input
[GPT_4O_MINI_AUDIO.name]: typeof GPT_4O_MINI_AUDIO.supports.input

// Text-only models
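Reviewer note: the two exported maps above are what downstream callers use for per-model typing. A minimal, hypothetical type-level sketch of looking up the new entry by name (assuming both maps are exported from model-meta.ts as shown in this diff):

// Type-level lookups against the maps above; the string key must match the
// registered `name` field exactly.
type AudioPreviewOptions =
  OpenAIChatModelProviderOptionsByName['gpt-4o-audio-preview']
// Resolves to OpenAIBaseOptions & OpenAIStreamingOptions &
// OpenAIMetadataOptions & OpenAIAudioOutputOptions per the entry added here.

type AudioPreviewInputs =
  OpenAIModelInputModalitiesByName['gpt-4o-audio-preview']
// Resolves to readonly ['text', 'audio'] because the meta object is `as const`.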
168 changes: 167 additions & 1 deletion packages/typescript/ai-openai/src/openai-adapter.ts
@@ -89,6 +89,172 @@ export class OpenAI extends BaseAdapter<

async *chatStream(
options: ChatOptions<string, OpenAIProviderOptions>,
): AsyncIterable<StreamChunk> {
// Check if audio output is requested via providerOptions
const providerOptions = options.providerOptions as
| ExternalTextProviderOptions
| undefined
const hasAudioOutput = providerOptions?.modalities?.includes('audio')

if (hasAudioOutput) {
// Use Chat Completions API for audio output
yield* this.chatStreamWithAudio(options)
} else {
// Use Responses API for standard text/tool use cases
yield* this.chatStreamWithResponses(options)
}
}

/**
* Stream using Chat Completions API with audio output support.
* Required for models like gpt-4o-audio-preview that support audio modalities.
*/
private async *chatStreamWithAudio(
options: ChatOptions<string, OpenAIProviderOptions>,
): AsyncIterable<StreamChunk> {
const providerOptions = options.providerOptions as
| ExternalTextProviderOptions
| undefined
const timestamp = Date.now()

try {
// Build Chat Completions API request
const messages = this.convertMessagesToChatCompletions(options.messages)

// Add system prompt if provided
if (options.systemPrompts && options.systemPrompts.length > 0) {
messages.unshift({
role: 'system',
content: options.systemPrompts.join('\n'),
})
}

const response = await this.client.chat.completions.create({
model: options.model,
messages,
modalities: providerOptions?.modalities,
audio: providerOptions?.audio,
temperature: options.options?.temperature,
max_completion_tokens: options.options?.maxTokens,
top_p: options.options?.topP,
stream: true,
})

let accumulatedContent = ''
const responseId = this.generateId()

for await (const chunk of response) {
const choice = chunk.choices[0]
if (!choice) continue

const delta = choice.delta as {
content?: string | null
audio?: { transcript?: string; data?: string }
}

// Handle text content delta
if (delta.content) {
accumulatedContent += delta.content
yield {
type: 'content',
id: responseId,
model: options.model,
timestamp,
delta: delta.content,
content: accumulatedContent,
role: 'assistant',
}
}

// Handle audio transcript (also counts as content)
if (delta.audio?.transcript) {
accumulatedContent += delta.audio.transcript
yield {
type: 'content',
id: responseId,
model: options.model,
timestamp,
delta: delta.audio.transcript,
content: accumulatedContent,
role: 'assistant',
}
}

// Handle audio data
if (delta.audio?.data) {
yield {
type: 'audio',
id: responseId,
model: options.model,
timestamp,
data: delta.audio.data,
transcript: delta.audio.transcript,
format: providerOptions?.audio?.format,
}
}

// Handle finish
if (choice.finish_reason) {
yield {
type: 'done',
id: responseId,
model: options.model,
timestamp,
finishReason:
choice.finish_reason === 'stop'
? 'stop'
: choice.finish_reason === 'length'
? 'length'
: choice.finish_reason === 'content_filter'
? 'content_filter'
: null,
usage: chunk.usage
? {
promptTokens: chunk.usage.prompt_tokens,
completionTokens: chunk.usage.completion_tokens,
totalTokens: chunk.usage.total_tokens,
}
: undefined,
}
}
}
} catch (error: any) {
console.error('chatStreamWithAudio: error during audio streaming', error)
yield {
type: 'error',
id: this.generateId(),
model: options.model,
timestamp,
error: {
message: error?.message || 'Unknown error occurred',
code: error?.code,
},
}
}
}

/**
* Convert ModelMessage array to Chat Completions API format.
*/
private convertMessagesToChatCompletions(
messages: Array<ModelMessage>,
): Array<{ role: 'system' | 'user' | 'assistant'; content: string }> {
return messages
.filter((m) => m.role !== 'tool') // Filter out tool messages for now
.map((m) => ({
role: m.role as 'user' | 'assistant',
content: this.extractTextContent(m.content),
}))
}

/**
* Stream using Responses API (default for non-audio use cases).
*/
private async *chatStreamWithResponses(
options: ChatOptions<string, OpenAIProviderOptions>,
): AsyncIterable<StreamChunk> {
// Track tool call metadata by unique ID
// OpenAI streams tool calls with deltas - first chunk has ID/name, subsequent chunks only have args
@@ -108,7 +274,7 @@
},
)

// Chat Completions API uses SSE format - iterate directly
// Responses API uses SSE format - iterate directly
yield* this.processOpenAIStreamChunks(
response,
toolCallMetadata,
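Reviewer note: a minimal, hypothetical usage sketch of the new audio path. It assumes the adapter is constructed with an API key, that ChatOptions carries `model`, `messages`, and `providerOptions` as used above, and that a plain string `content` is an acceptable message shape; the error-chunk shape follows the `type: 'error'` chunks yielded in this diff. None of this is asserted by the diff itself.

const adapter = new OpenAI({ apiKey: process.env.OPENAI_API_KEY! }) // constructor shape assumed

let transcript = ''
const audioParts: Array<string> = []

for await (const chunk of adapter.chatStream({
  model: 'gpt-4o-audio-preview',
  messages: [{ role: 'user', content: 'Read me a one-sentence weather report.' }],
  providerOptions: {
    modalities: ['text', 'audio'],
    audio: { voice: 'alloy', format: 'pcm16' },
  },
})) {
  if (chunk.type === 'content') {
    transcript = chunk.content // accumulated transcript so far
  } else if (chunk.type === 'audio') {
    audioParts.push(chunk.data) // base64-encoded pcm16 frames
  } else if (chunk.type === 'error') {
    throw new Error(chunk.error.message)
  }
}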
47 changes: 46 additions & 1 deletion packages/typescript/ai-openai/src/text/text-provider-options.ts
@@ -234,12 +234,57 @@ https://platform.openai.com/docs/api-reference/responses/create#responses_create
metadata?: Record<string, string>
}

/**
* Audio output options for models that support audio generation (e.g., gpt-4o-audio-preview).
* When specified, the model will generate audio output in addition to or instead of text.
* Note: Audio output requires the Chat Completions API, not the Responses API.
* @see https://platform.openai.com/docs/guides/audio
*/
export interface OpenAIAudioOutputOptions {
/**
* Output modalities to generate. Include 'audio' to enable audio output.
* Example: ['text', 'audio'] for both text and audio output.
*/
modalities?: Array<'text' | 'audio'>
/**
* Audio output configuration. Required when modalities includes 'audio'.
*/
audio?: {
/**
* The voice to use for audio generation.
* Available voices: alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer, verse
*/
voice:
| 'alloy'
| 'ash'
| 'ballad'
| 'coral'
| 'echo'
| 'fable'
| 'onyx'
| 'nova'
| 'sage'
| 'shimmer'
| 'verse'
/**
* The audio format for output.
* - 'pcm16': Raw 16-bit PCM audio at 24kHz (best for real-time streaming)
* - 'wav': WAV format
* - 'mp3': MP3 format
* - 'opus': Opus format
* - 'flac': FLAC format
*/
format: 'pcm16' | 'wav' | 'mp3' | 'opus' | 'flac'
}
}

export type ExternalTextProviderOptions = OpenAIBaseOptions &
OpenAIReasoningOptions &
OpenAIStructuredOutputOptions &
OpenAIToolsOptions &
OpenAIStreamingOptions &
OpenAIMetadataOptions
OpenAIMetadataOptions &
OpenAIAudioOutputOptions

/**
* Options your SDK forwards to OpenAI when doing chat/responses.
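Reviewer note: the interface does not statically tie the two fields together. `audio` is documented as required whenever `modalities` includes 'audio', but TypeScript will accept one without the other, so callers pair them by convention. A small illustrative constant (hypothetical caller code, not part of this diff):

const audioOutput: OpenAIAudioOutputOptions = {
  modalities: ['text', 'audio'],
  // Per the JSDoc above, `audio` must be supplied whenever 'audio' is requested;
  // the type system does not enforce the pairing.
  audio: { voice: 'verse', format: 'wav' },
}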
16 changes: 16 additions & 0 deletions packages/typescript/ai/src/types.ts
@@ -583,6 +583,7 @@ export interface ChatOptions<

export type StreamChunkType =
| 'content'
| 'audio'
| 'tool_call'
| 'tool_result'
| 'done'
@@ -666,11 +667,26 @@ export interface ThinkingStreamChunk extends BaseStreamChunk {
content: string // Full accumulated thinking content so far
}

/**
* Audio output stream chunk for models that support audio output (e.g., gpt-4o-audio-preview).
* Contains base64-encoded audio data and optional transcript.
*/
export interface AudioStreamChunk extends BaseStreamChunk {
type: 'audio'
/** Base64-encoded audio data (e.g., PCM16, WAV) */
data: string
/** Optional transcript of the audio content */
transcript?: string
/** Audio format (e.g., 'pcm16', 'wav', 'mp3') */
format?: string
}

/**
* Chunk returned by the SDK during streaming chat completions.
*/
export type StreamChunk =
| ContentStreamChunk
| AudioStreamChunk
| ToolCallStreamChunk
| ToolResultStreamChunk
| DoneStreamChunk
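Reviewer note: a consumer-side sketch of handling the new chunk type, using the StreamChunk and AudioStreamChunk types above. It assumes a Node.js environment and that the collected audio is headerless pcm16 (24 kHz, 16-bit little-endian), which needs a WAV header or a raw-PCM-capable player before playback; the function name and file path are illustrative only.

import { writeFileSync } from 'node:fs'

// Concatenate the base64 audio payloads from a finished stream and write the
// raw PCM bytes to disk.
function savePcm16(chunks: Array<StreamChunk>, path: string): void {
  const pcm = Buffer.concat(
    chunks
      .filter((c): c is AudioStreamChunk => c.type === 'audio')
      .map((c) => Buffer.from(c.data, 'base64')),
  )
  writeFileSync(path, pcm)
}

// savePcm16(collectedChunks, 'reply.pcm')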