diff --git a/packages/typescript/ai-openai/src/model-meta.ts b/packages/typescript/ai-openai/src/model-meta.ts
index 024a6c1a..13c38c86 100644
--- a/packages/typescript/ai-openai/src/model-meta.ts
+++ b/packages/typescript/ai-openai/src/model-meta.ts
@@ -1,4 +1,5 @@
 import type {
+  OpenAIAudioOutputOptions,
   OpenAIBaseOptions,
   OpenAIMetadataOptions,
   OpenAIReasoningOptions,
@@ -1072,6 +1073,32 @@ const GPT_4O_AUDIO = {
   OpenAIMetadataOptions
 >
 
+const GPT_4O_AUDIO_PREVIEW = {
+  name: 'gpt-4o-audio-preview',
+  context_window: 128_000,
+  max_output_tokens: 16_384,
+  knowledge_cutoff: '2023-10-01',
+  pricing: {
+    input: {
+      normal: 2.5,
+    },
+    output: {
+      normal: 10,
+    },
+  },
+  supports: {
+    input: ['text', 'audio'],
+    output: ['text', 'audio'],
+    endpoints: ['chat-completions'],
+    features: ['streaming', 'function_calling'],
+  },
+} as const satisfies ModelMeta<
+  OpenAIBaseOptions &
+    OpenAIToolsOptions &
+    OpenAIStreamingOptions &
+    OpenAIMetadataOptions
+>
+
 const GPT_4O_MINI = {
   name: 'gpt-4o-mini',
   context_window: 128_000,
@@ -1635,6 +1662,7 @@ export const OPENAI_CHAT_MODELS = [
   GPT_AUDIO.name,
   GPT_AUDIO_MINI.name,
   GPT_4O_AUDIO.name,
+  GPT_4O_AUDIO_PREVIEW.name,
   GPT_4O_MINI_AUDIO.name,
   // ChatGPT models
   GPT_5_1_CHAT.name,
@@ -1879,16 +1907,24 @@ export type OpenAIChatModelProviderOptionsByName = {
   // Audio models
   [GPT_AUDIO.name]: OpenAIBaseOptions &
     OpenAIStreamingOptions &
-    OpenAIMetadataOptions
+    OpenAIMetadataOptions &
+    OpenAIAudioOutputOptions
   [GPT_AUDIO_MINI.name]: OpenAIBaseOptions &
     OpenAIStreamingOptions &
-    OpenAIMetadataOptions
+    OpenAIMetadataOptions &
+    OpenAIAudioOutputOptions
   [GPT_4O_AUDIO.name]: OpenAIBaseOptions &
     OpenAIStreamingOptions &
-    OpenAIMetadataOptions
+    OpenAIMetadataOptions &
+    OpenAIAudioOutputOptions
+  [GPT_4O_AUDIO_PREVIEW.name]: OpenAIBaseOptions &
+    OpenAIStreamingOptions &
+    OpenAIMetadataOptions &
+    OpenAIAudioOutputOptions
   [GPT_4O_MINI_AUDIO.name]: OpenAIBaseOptions &
     OpenAIStreamingOptions &
-    OpenAIMetadataOptions
+    OpenAIMetadataOptions &
+    OpenAIAudioOutputOptions
 
   // Chat-only models
   [GPT_5_1_CHAT.name]: OpenAIBaseOptions &
@@ -1974,6 +2010,7 @@ export type OpenAIModelInputModalitiesByName = {
   [GPT_AUDIO.name]: typeof GPT_AUDIO.supports.input
   [GPT_AUDIO_MINI.name]: typeof GPT_AUDIO_MINI.supports.input
   [GPT_4O_AUDIO.name]: typeof GPT_4O_AUDIO.supports.input
+  [GPT_4O_AUDIO_PREVIEW.name]: typeof GPT_4O_AUDIO_PREVIEW.supports.input
   [GPT_4O_MINI_AUDIO.name]: typeof GPT_4O_MINI_AUDIO.supports.input
 
   // Text-only models
diff --git a/packages/typescript/ai-openai/src/openai-adapter.ts b/packages/typescript/ai-openai/src/openai-adapter.ts
index 676d5257..b5df957e 100644
--- a/packages/typescript/ai-openai/src/openai-adapter.ts
+++ b/packages/typescript/ai-openai/src/openai-adapter.ts
@@ -89,6 +89,172 @@ export class OpenAI extends BaseAdapter<
 
   async *chatStream(
     options: ChatOptions,
+  ): AsyncIterable<StreamChunk> {
+    // Check if audio output is requested via providerOptions
+    const providerOptions = options.providerOptions as
+      | ExternalTextProviderOptions
+      | undefined
+    const hasAudioOutput = providerOptions?.modalities?.includes('audio')
+
+    if (hasAudioOutput) {
+      // Use Chat Completions API for audio output
+      yield* this.chatStreamWithAudio(options)
+    } else {
+      // Use Responses API for standard text/tool use cases
+      yield* this.chatStreamWithResponses(options)
+    }
+  }
+
+  /**
+   * Stream using the Chat Completions API with audio output support.
+   * Required for models like gpt-4o-audio-preview that support audio modalities.
+   */
+  private async *chatStreamWithAudio(
+    options: ChatOptions,
+  ): AsyncIterable<StreamChunk> {
+    const providerOptions = options.providerOptions as
+      | ExternalTextProviderOptions
+      | undefined
+    const timestamp = Date.now()
+
+    try {
+      // Build the Chat Completions API request
+      const messages = this.convertMessagesToChatCompletions(options.messages)
+
+      // Add system prompt if provided
+      if (options.systemPrompts && options.systemPrompts.length > 0) {
+        messages.unshift({
+          role: 'system',
+          content: options.systemPrompts.join('\n'),
+        })
+      }
+
+      const response = await this.client.chat.completions.create({
+        model: options.model,
+        messages,
+        modalities: providerOptions?.modalities,
+        audio: providerOptions?.audio,
+        temperature: options.options?.temperature,
+        max_completion_tokens: options.options?.maxTokens,
+        top_p: options.options?.topP,
+        stream: true,
+      })
+
+      let accumulatedContent = ''
+      const responseId = this.generateId()
+
+      for await (const chunk of response) {
+        const choice = chunk.choices[0]
+        if (!choice) continue
+
+        const delta = choice.delta as {
+          content?: string | null
+          audio?: { transcript?: string; data?: string }
+        }
+
+        // Handle text content delta
+        if (delta.content) {
+          accumulatedContent += delta.content
+          yield {
+            type: 'content',
+            id: responseId,
+            model: options.model,
+            timestamp,
+            delta: delta.content,
+            content: accumulatedContent,
+            role: 'assistant',
+          }
+        }
+
+        // Handle audio transcript (also counts as content)
+        if (delta.audio?.transcript) {
+          accumulatedContent += delta.audio.transcript
+          yield {
+            type: 'content',
+            id: responseId,
+            model: options.model,
+            timestamp,
+            delta: delta.audio.transcript,
+            content: accumulatedContent,
+            role: 'assistant',
+          }
+        }
+
+        // Handle audio data
+        if (delta.audio?.data) {
+          yield {
+            type: 'audio',
+            id: responseId,
+            model: options.model,
+            timestamp,
+            data: delta.audio.data,
+            transcript: delta.audio.transcript,
+            format: providerOptions?.audio?.format,
+          }
+        }
+
+        // Handle finish
+        if (choice.finish_reason) {
+          yield {
+            type: 'done',
+            id: responseId,
+            model: options.model,
+            timestamp,
+            finishReason:
+              choice.finish_reason === 'stop'
+                ? 'stop'
+                : choice.finish_reason === 'length'
+                  ? 'length'
+                  : choice.finish_reason === 'content_filter'
+                    ? 'content_filter'
+                    : null,
+            usage: chunk.usage
+              ? {
+                  promptTokens: chunk.usage.prompt_tokens,
+                  completionTokens: chunk.usage.completion_tokens,
+                  totalTokens: chunk.usage.total_tokens,
+                }
+              : undefined,
+          }
+        }
+      }
+    } catch (error: any) {
+      console.error('chatStreamWithAudio: error during audio streaming:', error)
+      yield {
+        type: 'error',
+        id: this.generateId(),
+        model: options.model,
+        timestamp,
+        error: {
+          message: error?.message || 'Unknown error occurred',
+          code: error?.code,
+        },
+      }
+    }
+  }
+
+  /**
+   * Convert a ModelMessage array to the Chat Completions API format.
+   */
+  private convertMessagesToChatCompletions(
+    messages: Array<ModelMessage>,
+  ): Array<{ role: 'system' | 'user' | 'assistant'; content: string }> {
+    return messages
+      .filter((m) => m.role !== 'tool') // Filter out tool messages for now
+      .map((m) => ({
+        role: m.role as 'user' | 'assistant',
+        content: this.extractTextContent(m.content),
+      }))
+  }
+
+  /**
+   * Stream using the Responses API (default for non-audio use cases).
+   */
+  private async *chatStreamWithResponses(
+    options: ChatOptions,
   ): AsyncIterable<StreamChunk> {
     // Track tool call metadata by unique ID
     // OpenAI streams tool calls with deltas - first chunk has ID/name, subsequent chunks only have args
@@ -108,7 +274,7 @@
       },
     )
 
-    // Chat Completions API uses SSE format - iterate directly
+    // Responses API uses SSE format - iterate directly
     yield* this.processOpenAIStreamChunks(
       response,
       toolCallMetadata,
diff --git a/packages/typescript/ai-openai/src/text/text-provider-options.ts b/packages/typescript/ai-openai/src/text/text-provider-options.ts
index ba9d6049..78ccacbb 100644
--- a/packages/typescript/ai-openai/src/text/text-provider-options.ts
+++ b/packages/typescript/ai-openai/src/text/text-provider-options.ts
@@ -234,12 +234,57 @@ https://platform.openai.com/docs/api-reference/responses/create#responses_create
   metadata?: Record<string, string>
 }
 
+/**
+ * Audio output options for models that support audio generation (e.g., gpt-4o-audio-preview).
+ * When specified, the model will generate audio output in addition to, or instead of, text.
+ * Note: audio output requires the Chat Completions API, not the Responses API.
+ * @see https://platform.openai.com/docs/guides/audio
+ */
+export interface OpenAIAudioOutputOptions {
+  /**
+   * Output modalities to generate. Include 'audio' to enable audio output.
+   * Example: ['text', 'audio'] for both text and audio output.
+   */
+  modalities?: Array<'text' | 'audio'>
+  /**
+   * Audio output configuration. Required when modalities includes 'audio'.
+   */
+  audio?: {
+    /**
+     * The voice to use for audio generation.
+     * Available voices: alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer, verse
+     */
+    voice:
+      | 'alloy'
+      | 'ash'
+      | 'ballad'
+      | 'coral'
+      | 'echo'
+      | 'fable'
+      | 'onyx'
+      | 'nova'
+      | 'sage'
+      | 'shimmer'
+      | 'verse'
+    /**
+     * The audio format for output.
+     * - 'pcm16': raw 16-bit PCM audio at 24kHz (best for real-time streaming)
+     * - 'wav': WAV format
+     * - 'mp3': MP3 format
+     * - 'opus': Opus format
+     * - 'flac': FLAC format
+     */
+    format: 'pcm16' | 'wav' | 'mp3' | 'opus' | 'flac'
+  }
+}
+
 export type ExternalTextProviderOptions = OpenAIBaseOptions &
   OpenAIReasoningOptions &
   OpenAIStructuredOutputOptions &
   OpenAIToolsOptions &
   OpenAIStreamingOptions &
-  OpenAIMetadataOptions
+  OpenAIMetadataOptions &
+  OpenAIAudioOutputOptions
 
 /**
  * Options your SDK forwards to OpenAI when doing chat/responses.
diff --git a/packages/typescript/ai/src/types.ts b/packages/typescript/ai/src/types.ts
index 748526c2..3d2f16e3 100644
--- a/packages/typescript/ai/src/types.ts
+++ b/packages/typescript/ai/src/types.ts
@@ -583,6 +583,7 @@ export interface ChatOptions<
 
 export type StreamChunkType =
   | 'content'
+  | 'audio'
   | 'tool_call'
   | 'tool_result'
   | 'done'
@@ -666,11 +667,26 @@ export interface ThinkingStreamChunk extends BaseStreamChunk {
   content: string // Full accumulated thinking content so far
 }
 
+/**
+ * Audio output stream chunk for models that support audio output (e.g., gpt-4o-audio-preview).
+ * Contains base64-encoded audio data and an optional transcript.
+ */
+export interface AudioStreamChunk extends BaseStreamChunk {
+  type: 'audio'
+  /** Base64-encoded audio data (e.g., PCM16, WAV) */
+  data: string
+  /** Optional transcript of the audio content */
+  transcript?: string
+  /** Audio format (e.g., 'pcm16', 'wav', 'mp3') */
+  format?: string
+}
+
 /**
  * Chunk returned by the sdk during streaming chat completions.
  */
 export type StreamChunk =
   | ContentStreamChunk
+  | AudioStreamChunk
   | ToolCallStreamChunk
   | ToolResultStreamChunk
   | DoneStreamChunk
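
Usage sketch (not part of the diff): a minimal, hypothetical consumer of the new audio path. It assumes the adapter instance is constructed elsewhere, that `ChatOptions.messages` accepts a plain `{ role, content }` shape, and that the import path mirrors the repo layout shown above; chunk handling follows the `AudioStreamChunk` and `ContentStreamChunk` types added in this diff.

import { writeFileSync } from 'node:fs'
// Assumption: the OpenAI adapter class is importable from the adapter module seen in this diff.
import type { OpenAI } from './packages/typescript/ai-openai/src/openai-adapter'

// Hypothetical pre-constructed adapter instance; construction is outside the scope of this diff.
declare const openai: OpenAI

async function speakHello(): Promise<void> {
  const audioParts: Array<Buffer> = []

  const stream = openai.chatStream({
    model: 'gpt-4o-audio-preview',
    // Assumed message shape; see ChatOptions in packages/typescript/ai/src/types.ts.
    messages: [{ role: 'user', content: 'Say hello in one short sentence.' }],
    providerOptions: {
      // Including 'audio' here routes the request through chatStreamWithAudio
      // (Chat Completions API) instead of the Responses API.
      modalities: ['text', 'audio'],
      audio: { voice: 'alloy', format: 'wav' },
    },
  })

  for await (const chunk of stream) {
    if (chunk.type === 'audio') {
      // chunk.data is base64-encoded per AudioStreamChunk; decode each
      // fragment separately rather than concatenating base64 strings.
      audioParts.push(Buffer.from(chunk.data, 'base64'))
    } else if (chunk.type === 'content') {
      process.stdout.write(chunk.delta)
    }
  }

  // Simplification: assumes the streamed 'wav' bytes concatenate into a
  // playable file (header streamed once at the start).
  writeFileSync('hello.wav', Buffer.concat(audioParts))
}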