diff --git a/sdk/typescript/src/ai/MultimodalResponse.ts b/sdk/typescript/src/ai/MultimodalResponse.ts new file mode 100644 index 00000000..07da87f3 --- /dev/null +++ b/sdk/typescript/src/ai/MultimodalResponse.ts @@ -0,0 +1,537 @@ +/** + * Multimodal response classes for handling LLM multimodal outputs. + * Provides seamless integration with audio, image, and file outputs while maintaining backward compatibility. + */ + +import { promises as fs } from 'fs'; +import * as path from 'path'; + +/** + * Image returned by an LLM, referenced by URL and/or carried inline as base64. + */ +export interface ImageOutput { + /** URL to image (may be a data: URL) */ + url?: string; + /** Base64-encoded image data */ + b64Json?: string; + /** Revised prompt used for generation */ + revisedPrompt?: string; +} + +/** + * Audio returned by an LLM, carried inline as base64 and/or referenced by URL. + */ +export interface AudioOutput { + /** Base64-encoded audio data */ + data?: string; + /** Audio format (wav, mp3, etc.); the only required field */ + format: string; + /** URL to audio file if available */ + url?: string; +} + +/** + * Represents generic file output from LLM. + */ +export interface FileOutput { + /** URL to file */ + url?: string; + /** Base64-encoded file data */ + data?: string; + /** MIME type of file */ + mimeType?: string; + /** Suggested filename */ + filename?: string; +} + +/** + * Enhanced response object that provides seamless access to multimodal content + * while maintaining backward compatibility with string responses. 
/**
 * Enhanced response object that provides seamless access to multimodal content
 * while maintaining backward compatibility with string responses.
 *
 * Bundles the text of an LLM reply with any audio, image, and file outputs,
 * and can write all of them to disk via `save()`. `toString()` returns the
 * plain text so the object can be used where a string was used before.
 */
export class MultimodalResponse {
  private _text: string;
  private _audio: AudioOutput | null;
  private _images: ImageOutput[];
  private _files: FileOutput[];
  private _rawResponse: unknown;
  private _costUsd: number | null;
  private _usage: {
    promptTokens?: number;
    completionTokens?: number;
    totalTokens?: number;
  };

  /**
   * @param text - Text content of the response.
   * @param audio - Audio output, or null when there is none.
   * @param images - Image outputs (stored by reference, not copied).
   * @param files - File outputs (stored by reference, not copied).
   * @param rawResponse - The unprocessed LLM response object.
   * @param costUsd - Estimated cost of the call in USD, or null if unknown.
   * @param usage - Token usage breakdown.
   */
  constructor(
    text: string = '',
    audio: AudioOutput | null = null,
    images: ImageOutput[] = [],
    files: FileOutput[] = [],
    rawResponse: unknown = null,
    costUsd: number | null = null,
    usage: { promptTokens?: number; completionTokens?: number; totalTokens?: number } = {}
  ) {
    this._text = text;
    this._audio = audio;
    this._images = images;
    this._files = files;
    this._rawResponse = rawResponse;
    this._costUsd = costUsd;
    this._usage = usage;
  }

  /** Get text content. */
  get text(): string {
    return this._text;
  }

  /** Get audio output if available. */
  get audio(): AudioOutput | null {
    return this._audio;
  }

  /** Get list of image outputs. */
  get images(): ImageOutput[] {
    return this._images;
  }

  /** Get list of file outputs. */
  get files(): FileOutput[] {
    return this._files;
  }

  /** Check if response contains audio. */
  hasAudio(): boolean {
    return this._audio !== null;
  }

  /** Check if response contains images. */
  hasImage(): boolean {
    return this._images.length > 0;
  }

  /** Check if response contains files. */
  hasFile(): boolean {
    return this._files.length > 0;
  }

  /** Check if response contains any multimodal content. */
  isMultimodal(): boolean {
    return this.hasAudio() || this.hasImage() || this.hasFile();
  }

  /** Get the raw LLM response object. */
  get rawResponse(): unknown {
    return this._rawResponse;
  }

  /** Estimated cost of this LLM call in USD, if available. */
  get costUsd(): number | null {
    return this._costUsd;
  }

  /** Token usage breakdown. */
  get usage(): { promptTokens?: number; completionTokens?: number; totalTokens?: number } {
    return this._usage;
  }

  /** Get all images. Alias for the `images` property for API consistency. */
  getImages(): ImageOutput[] {
    return this._images;
  }

  /** Get audio. Alias for the `audio` property for API consistency. */
  getAudio(): AudioOutput | null {
    return this._audio;
  }

  /** Get all files. Alias for the `files` property for API consistency. */
  getFiles(): FileOutput[] {
    return this._files;
  }

  /** Decode a base64 string into raw bytes. */
  private static decodeBase64(encoded: string): Uint8Array {
    return Buffer.from(encoded, 'base64');
  }

  /**
   * Decode the payload of a `data:` URL.
   * Base64 payloads (header ending in `;base64`) are base64-decoded; any
   * other payload is treated as percent-encoded text.
   * Returns null when the URL has no payload.
   */
  private static decodeDataUrl(dataUrl: string): Uint8Array | null {
    const comma = dataUrl.indexOf(',');
    if (comma === -1) return null;
    const header = dataUrl.slice(0, comma);
    const payload = dataUrl.slice(comma + 1);
    if (!payload) return null;
    return /;base64$/i.test(header)
      ? Buffer.from(payload, 'base64')
      : Buffer.from(decodeURIComponent(payload), 'utf-8');
  }

  /** Reduce a candidate extension to alphanumerics, or use the fallback if nothing is left. */
  private static sanitizeExtension(candidate: string | undefined, fallback: string): string {
    const cleaned = (candidate || '').replace(/[^A-Za-z0-9]/g, '');
    return cleaned || fallback;
  }

  /**
   * Pick a file extension for an image: the MIME subtype for `data:` URLs,
   * the extension of the URL path otherwise, and `png` as the default.
   */
  private static imageExtension(image: ImageOutput): string {
    const url = image.url;
    if (!url) return 'png';

    if (url.startsWith('data:')) {
      const match = /^data:image\/([A-Za-z0-9-]+)/.exec(url);
      return MultimodalResponse.sanitizeExtension(match ? match[1] : undefined, 'png');
    }

    // Look only at the path so query strings, fragments and the host name
    // cannot leak into the extension.
    let pathname: string;
    try {
      pathname = new URL(url).pathname;
    } catch {
      const cut = url.search(/[?#]/);
      pathname = cut === -1 ? url : url.slice(0, cut);
    }
    return MultimodalResponse.sanitizeExtension(path.posix.extname(pathname).slice(1), 'png');
  }

  /**
   * Reduce a response-supplied filename to its base name so that it cannot
   * point outside the output directory; use the fallback when nothing usable remains.
   */
  private static safeFilename(requested: string | undefined, fallback: string): string {
    if (!requested) return fallback;
    const base = path.posix.basename(requested.replace(/\\/g, '/'));
    return base && base !== '.' && base !== '..' ? base : fallback;
  }

  /**
   * Get raw image bytes from an ImageOutput.
   * @throws Error if the image has neither inline data nor a `data:` URL.
   */
  private getImageBytes(image: ImageOutput): Uint8Array {
    if (image.b64Json) {
      return MultimodalResponse.decodeBase64(image.b64Json);
    }
    if (image.url) {
      if (image.url.startsWith('data:')) {
        const bytes = MultimodalResponse.decodeDataUrl(image.url);
        if (bytes) return bytes;
      }
      throw new Error('URL download not implemented - provide b64Json instead');
    }
    throw new Error('No image data available');
  }

  /**
   * Get raw audio bytes from an AudioOutput.
   * @throws Error if the audio has no inline data.
   */
  private getAudioBytes(audio: AudioOutput): Uint8Array {
    if (audio.data) {
      return MultimodalResponse.decodeBase64(audio.data);
    }
    throw new Error('No audio data available');
  }

  /**
   * Get raw file bytes from a FileOutput.
   * @throws Error if the file has no inline data.
   */
  private getFileBytes(file: FileOutput): Uint8Array {
    if (file.data) {
      return MultimodalResponse.decodeBase64(file.data);
    }
    throw new Error('No file data available');
  }

  /** Save a single image to file, creating parent directories as needed. */
  async saveImage(image: ImageOutput, imagePath: string): Promise<void> {
    const bytes = this.getImageBytes(image);
    await fs.mkdir(path.dirname(imagePath), { recursive: true });
    await fs.writeFile(imagePath, bytes);
  }

  /** Save a single audio to file, creating parent directories as needed. */
  async saveAudio(audio: AudioOutput, audioPath: string): Promise<void> {
    const bytes = this.getAudioBytes(audio);
    await fs.mkdir(path.dirname(audioPath), { recursive: true });
    await fs.writeFile(audioPath, bytes);
  }

  /** Save a single file to disk, creating parent directories as needed. */
  async saveFile(file: FileOutput, filePath: string): Promise<void> {
    const bytes = this.getFileBytes(file);
    await fs.mkdir(path.dirname(filePath), { recursive: true });
    await fs.writeFile(filePath, bytes);
  }

  /**
   * Save all multimodal content to a directory.
   *
   * @param outputDir - Directory to write into; created if missing.
   * @param prefix - Prefix for generated file names.
   * @returns Map from content key (`audio`, `image_<i>`, `file_<i>`, `text`)
   *          to the path that was written.
   * @throws Error if any item has no inline data to write.
   */
  async save(outputDir: string, prefix: string = 'output'): Promise<Record<string, string>> {
    const savedFiles: Record<string, string> = {};

    // Ensure output directory exists
    await fs.mkdir(outputDir, { recursive: true });

    // Save audio
    if (this._audio) {
      const ext = MultimodalResponse.sanitizeExtension(this._audio.format, 'wav');
      const audioPath = path.join(outputDir, `${prefix}_audio.${ext}`);
      await this.saveAudio(this._audio, audioPath);
      savedFiles['audio'] = audioPath;
    }

    // Save images
    for (let i = 0; i < this._images.length; i++) {
      const image = this._images[i];
      const ext = MultimodalResponse.imageExtension(image);
      const imagePath = path.join(outputDir, `${prefix}_image_${i}.${ext}`);
      await this.saveImage(image, imagePath);
      savedFiles[`image_${i}`] = imagePath;
    }

    // Save files; the suggested name comes from the response, so only its
    // base name is honoured.
    for (let i = 0; i < this._files.length; i++) {
      const file = this._files[i];
      const filename = MultimodalResponse.safeFilename(file.filename, `${prefix}_file_${i}`);
      const filePath = path.join(outputDir, filename);
      await this.saveFile(file, filePath);
      savedFiles[`file_${i}`] = filePath;
    }

    // Save text content
    if (this._text) {
      const textPath = path.join(outputDir, `${prefix}_text.txt`);
      await fs.writeFile(textPath, this._text, 'utf-8');
      savedFiles['text'] = textPath;
    }

    return savedFiles;
  }

  /** String representation for backward compatibility. */
  toString(): string {
    return this._text;
  }

  /** Developer-friendly representation used by `JSON.stringify`. */
  toJSON(): object {
    const parts: string[] = [];
    if (this._audio) parts.push(`audio=${this._audio.format}`);
    if (this._images.length > 0) parts.push(`images=${this._images.length}`);
    if (this._files.length > 0) parts.push(`files=${this._files.length}`);
    return {
      text: this._text,
      audio: this._audio,
      images: this._images,
      files: this._files,
      _debug: `MultimodalResponse(${parts.join(', ')})`
    };
  }
}

/**
 * Create a MultimodalResponse from raw LLM response data.
 * Handles multiple formats: OpenRouter, OpenAI, and generic patterns.
 *
 * @param rawResponse - Response data to scan for images, audio and files.
 * @param text - Text content to attach to the response.
 * @returns A response wrapping whatever content was found, plus `rawResponse`.
 */
export function createMultimodalResponse(
  rawResponse: unknown,
  text: string = ''
): MultimodalResponse {
  const images = extractImages(rawResponse);
  const audio = extractAudio(rawResponse);
  const files = extractFiles(rawResponse);
  return new MultimodalResponse(text, audio, images, files, rawResponse);
}
/**
 * Extract images from various data structures.
 *
 * Accepts an array of image-like items, or an object carrying any of:
 * an `images` array, an `image_url` entry, or direct `url` / `b64_json` /
 * `b64Json` fields. Each candidate is passed to `extractSingleImage`;
 * candidates it rejects are skipped.
 *
 * @param data - Value to scan.
 * @returns The images found, in discovery order; empty if none.
 */
function extractImages(data: unknown): ImageOutput[] {
  const images: ImageOutput[] = [];

  if (!data) return images;

  const collect = (candidate: unknown): void => {
    const extracted = extractSingleImage(candidate);
    if (extracted) images.push(extracted);
  };

  // Handle array of images
  if (Array.isArray(data)) {
    for (const item of data) collect(item);
    return images;
  }

  // Handle object with image data
  if (typeof data === 'object') {
    const obj = data as Record<string, unknown>;

    // Check for images array
    if (Array.isArray(obj.images)) {
      for (const img of obj.images) collect(img);
    }

    // Check for image_url structure (OpenRouter pattern)
    if (obj.image_url) collect(obj.image_url);

    // Check for direct url/b64_json
    if (obj.url || obj.b64_json || obj.b64Json) collect(obj);
  }

  return images;
}
/** Return the value if it is a non-empty string, otherwise undefined. */
function optionalString(value: unknown): string | undefined {
  return typeof value === 'string' && value !== '' ? value : undefined;
}

/**
 * Extract a single image from data.
 *
 * Accepts a URL string (`http…` or `data:`), an object with a nested
 * `image_url.url`, or an object with direct `url` / `image_url` /
 * `b64_json` / `b64Json` fields. Only string values are accepted for these
 * fields.
 *
 * @param data - Candidate image value.
 * @returns The image, or null when no URL or base64 data is present.
 */
function extractSingleImage(data: unknown): ImageOutput | null {
  if (!data) return null;

  if (typeof data === 'string') {
    // Direct URL string
    return data.startsWith('http') || data.startsWith('data:') ? { url: data } : null;
  }

  if (typeof data !== 'object') return null;
  const obj = data as Record<string, unknown>;

  // Handle image_url nested structure
  const nested = obj.image_url;
  if (nested && typeof nested === 'object') {
    const nestedUrl = optionalString((nested as Record<string, unknown>).url);
    if (nestedUrl) {
      if (nestedUrl.startsWith('data:image')) {
        // Expose the payload as b64Json only when the data URL really is
        // base64-encoded.
        const comma = nestedUrl.indexOf(',');
        const header = comma === -1 ? '' : nestedUrl.slice(0, comma);
        const payload = comma === -1 ? '' : nestedUrl.slice(comma + 1);
        const isBase64 = /;base64$/i.test(header);
        return { url: nestedUrl, b64Json: isBase64 && payload ? payload : undefined };
      }
      return { url: nestedUrl };
    }
  }

  // Direct url/b64_json
  const url = optionalString(obj.url) || optionalString(obj.image_url);
  const b64Json = optionalString(obj.b64_json) || optionalString(obj.b64Json);
  const revisedPrompt = optionalString(obj.revised_prompt) || optionalString(obj.revisedPrompt);

  if (url || b64Json) {
    return { url, b64Json, revisedPrompt };
  }

  return null;
}

/**
 * Extract audio from response data.
 *
 * Looks at an `audio` object first, then an `input_audio` object. A
 * candidate is used only if it carries inline data (or, for `audio`, a URL).
 * The format defaults to `wav`.
 *
 * @param data - Response data to scan.
 * @returns The audio output, or null when none with a payload is found.
 */
function extractAudio(data: unknown): AudioOutput | null {
  if (!data || typeof data !== 'object') return null;

  const obj = data as Record<string, unknown>;

  // Check for audio object
  if (obj.audio && typeof obj.audio === 'object') {
    const audio = obj.audio as Record<string, unknown>;
    const payload = optionalString(audio.data);
    const url = optionalString(audio.url);
    if (payload || url) {
      return { data: payload, format: optionalString(audio.format) || 'wav', url };
    }
  }

  // Check for input_audio structure
  if (obj.input_audio && typeof obj.input_audio === 'object') {
    const inputAudio = obj.input_audio as Record<string, unknown>;
    const payload = optionalString(inputAudio.data);
    if (payload) {
      return { data: payload, format: optionalString(inputAudio.format) || 'wav' };
    }
  }

  return null;
}
+ */ +function extractFiles(data: unknown): FileOutput[] { + const files: FileOutput[] = []; + + if (!data) return files; + + if (typeof data === 'object') { + const obj = data as Record; + + // Check for files array (only a top-level `files` key is inspected) + if (Array.isArray(obj.files)) { + for (const file of obj.files) { + const extracted = extractSingleFile(file); + if (extracted) files.push(extracted); + } + } + } + + return files; +} + +/** + * Extract a single file from data; returns null unless a truthy url or data field is present. NOTE(review): `Record` carries no type arguments in this text - presumably Record<string, unknown>; confirm. + */ +function extractSingleFile(data: unknown): FileOutput | null { + if (!data || typeof data !== 'object') return null; + + const obj = data as Record; + + const url = obj.url as string | undefined; + const data_b64 = obj.data as string | undefined; + const mimeType = (obj.mime_type || obj.mimeType) as string | undefined; + const filename = obj.filename as string | undefined; + + if (url || data_b64) { + return { url, data: data_b64, mimeType, filename }; + } + + return null; +} diff --git a/sdk/typescript/src/ai/multimodal.ts b/sdk/typescript/src/ai/multimodal.ts new file mode 100644 index 00000000..e3bb0ecb --- /dev/null +++ b/sdk/typescript/src/ai/multimodal.ts @@ -0,0 +1,405 @@ +/** + * Multimodal content helpers for AI prompts. + * Provides Image, Audio, and File classes with factory methods for creating + * multimodal content from various sources (files, URLs, buffers, base64). + */ + +import { readFile } from 'node:fs/promises'; +import { resolve } from 'node:path'; + +// MIME type mappings for common image formats (keys are lowercase extensions with the leading dot) +const IMAGE_MIME_TYPES: Record = { + '.jpg': 'image/jpeg', + '.jpeg': 'image/jpeg', + '.png': 'image/png', + '.gif': 'image/gif', + '.webp': 'image/webp', + '.bmp': 'image/bmp', +}; + +// MIME type mappings for common audio formats (keys are lowercase extensions with the leading dot) +const AUDIO_MIME_TYPES: Record = { + '.wav': 'audio/wav', + '.mp3': 'audio/mpeg', + '.flac': 'audio/flac', + '.ogg': 'audio/ogg', +}; + +/** + * Represents text content in a multimodal prompt. 
+ */ +export class Text { + readonly type: 'text' = 'text'; + readonly text: string; + + constructor(text: string) { + this.text = text; + } +} + +/** + * Represents image content in a multimodal prompt. Instances are created through the static from* factories (the constructor is private). NOTE(review): the factory return types read `Promise` with no type argument in this text - presumably Promise<Image>; confirm. + */ +export class Image { + readonly type: 'image_url' = 'image_url'; + readonly imageUrl: { url: string; detail?: 'low' | 'high' | 'auto' }; + + private constructor(imageUrl: { url: string; detail?: 'low' | 'high' | 'auto' }) { + this.imageUrl = imageUrl; + } + + /** + * Create Image from a local file by converting to base64 data URL. Unrecognized extensions fall back to image/jpeg. + */ + static async fromFile( + filePath: string, + detail: 'low' | 'high' | 'auto' = 'high' + ): Promise { + const absolutePath = resolve(filePath); + + // Read file and encode to base64 + const buffer = await readFile(absolutePath); + const base64Data = buffer.toString('base64'); + + // Determine MIME type from extension + const ext = getExtension(absolutePath).toLowerCase(); + const mimeType = IMAGE_MIME_TYPES[ext] || 'image/jpeg'; + + const dataUrl = `data:${mimeType};base64,${base64Data}`; + return new Image({ url: dataUrl, detail }); + } + + /** + * Create Image from a URL. The URL is stored as given; nothing is downloaded here. + */ + static async fromUrl( + url: string, + detail: 'low' | 'high' | 'auto' = 'high' + ): Promise { + return new Image({ url, detail }); + } + + /** + * Create Image from a buffer by encoding it as a base64 data URL. + */ + static async fromBuffer( + buffer: Buffer | Uint8Array, + mimeType: string = 'image/jpeg', + detail: 'low' | 'high' | 'auto' = 'high' + ): Promise { + const base64Data = Buffer.from(buffer).toString('base64'); + const dataUrl = `data:${mimeType};base64,${base64Data}`; + return new Image({ url: dataUrl, detail }); + } + + /** + * Create Image from a base64 string, wrapped as data:<mimeType>;base64,<base64Data>. 
+ */ + static async fromBase64( + base64Data: string, + mimeType: string = 'image/jpeg', + detail: 'low' | 'high' | 'auto' = 'high' + ): Promise { + const dataUrl = `data:${mimeType};base64,${base64Data}`; + return new Image({ url: dataUrl, detail }); + } +} + +/** + * Represents audio content in a multimodal prompt. + */ +export class Audio { + readonly type: 'input_audio' = 'input_audio'; + readonly audio: { data: string; format: string }; + + private constructor(audio: { data: string; format: string }) { + this.audio = audio; + } + + /** + * Create Audio from a local file by converting to base64. + */ + static async fromFile( + filePath: string, + format?: 'wav' | 'mp3' | 'flac' | 'ogg' + ): Promise