diff --git a/Sources/Core/Models/Item.swift b/Sources/Core/Models/Item.swift index 4f24a58..8ccea78 100644 --- a/Sources/Core/Models/Item.swift +++ b/Sources/Core/Models/Item.swift @@ -37,12 +37,14 @@ import MetaCodable case text(String) case audio(Audio) case inputText(String) + case inputImage(String) case inputAudio(Audio) public var text: String? { switch self { case let .text(text): text case let .inputText(text): text + case let .inputImage(image): image case let .audio(audio): audio.transcript case let .inputAudio(audio): audio.transcript } @@ -419,6 +421,7 @@ extension Item.Message.Content: Codable { case text case audio case transcript + case image_url } private struct Text: Codable { @@ -440,7 +443,10 @@ extension Item.Message.Content: Codable { case "input_text": let container = try decoder.container(keyedBy: Text.CodingKeys.self) self = try .inputText(container.decode(String.self, forKey: .text)) - case "output_audio": + case "input_image": + let inner = try decoder.container(keyedBy: CodingKeys.self) + self = try .inputImage(inner.decodeIfPresent(String.self, forKey: .image_url) ?? 
"") + case "output_audio": self = try .audio(Item.Audio(from: decoder)) case "input_audio": self = try .inputAudio(Item.Audio(from: decoder)) @@ -459,7 +465,10 @@ extension Item.Message.Content: Codable { case let .inputText(text): try container.encode(text, forKey: .text) try container.encode("input_text", forKey: .type) - case let .audio(audio): + case let .inputImage(imageURL): + try container.encode(imageURL, forKey: .image_url) + try container.encode("input_image", forKey: .type) + case let .audio(audio): try container.encode("output_audio", forKey: .type) try container.encode(audio.audio, forKey: .audio) try container.encode(audio.transcript, forKey: .transcript) diff --git a/Sources/Core/Models/ServerEvent.swift b/Sources/Core/Models/ServerEvent.swift index c8ff0af..2995c59 100644 --- a/Sources/Core/Models/ServerEvent.swift +++ b/Sources/Core/Models/ServerEvent.swift @@ -224,6 +224,11 @@ import MetaCodable @CodedAs("output_audio_buffer.stopped") case outputAudioBufferStopped(eventId: String, responseId: String) + /// Fired when the assistant’s output audio buffer is fully cleared. + /// No audio remains queued; the buffer is reset and ready for new output. + @CodedAs("output_audio_buffer.cleared") + case outputAudioBufferCleared(eventId: String, responseId: String) + /// Returned when a new Response is created. /// /// The first event of response creation, where the response is in an initial state of `inProgress`. 
@@ -548,6 +553,7 @@ extension ServerEvent: Identifiable { case let .inputAudioBufferTimeoutTriggered(id, _, _, _): id case let .outputAudioBufferStarted(id, _): id case let .outputAudioBufferStopped(id, _): id + case let .outputAudioBufferCleared(id, _): id case let .responseCreated(id, _): id case let .responseDone(id, _): id case let .responseOutputItemAdded(id, _, _, _): id diff --git a/Sources/UI/Conversation.swift b/Sources/UI/Conversation.swift index 5820057..2c86a4f 100644 --- a/Sources/UI/Conversation.swift +++ b/Sources/UI/Conversation.swift @@ -2,6 +2,7 @@ import Core import WebRTC import AVFAudio import Foundation +import Observation public enum ConversationError: Error { case sessionNotFound @@ -13,7 +14,7 @@ public enum ConversationError: Error { public final class Conversation: @unchecked Sendable { public typealias SessionUpdateCallback = (inout Session) -> Void - private let client: WebRTCConnector + private let client: WebRTCConnector private var task: Task! private let sessionUpdateCallback: SessionUpdateCallback? private let errorStream: AsyncStream.Continuation @@ -152,11 +153,25 @@ public final class Conversation: @unchecked Sendable { /// Send a text message and wait for a response. /// Optionally, you can provide a response configuration to customize the model's behavior. public func send(from role: Item.Message.Role, text: String, response: Response.Config? = nil) throws { - try send(event: .createConversationItem(.message(Item.Message(id: String(randomLength: 32), role: role, content: [.inputText(text)])))) + let id = UUID().uuidString.replacingOccurrences(of: "-", with: "") // random 32 character string + try send(event: .createConversationItem(.message(Item.Message(id: id, role: role, content: [.inputText(text)])))) try send(event: .createResponse(using: response)) } - /// Send the response of a function call. + /// Send an image-only message (embedded as a base64 `image/jpeg` data URI) and wait for a response. 
+ public func send(from role: Item.Message.Role, image: Data, response: Response.Config? = nil) throws { + let dataURI = "data:image/jpeg;base64,\(image.base64EncodedString())" + let id = UUID().uuidString.replacingOccurrences(of: "-", with: "") // random 32 character string + let message = Item.Message( + id: id, + role: role, + content: [.inputImage(dataURI)] + ) + try send(event: .createConversationItem(.message(message))) + try send(event: .createResponse(using: response)) + } + + /// Send the response of a function call. public func send(result output: Item.FunctionCallOutput) throws { try send(event: .createConversationItem(.functionCallOutput(output))) } @@ -176,10 +191,15 @@ private extension Conversation { if let sessionUpdateCallback { try updateSession(withChanges: sessionUpdateCallback) } case let .sessionUpdated(_, session): self.session = session - case let .conversationItemCreated(_, item, _): + case let .conversationItemCreated(_, item, _), + let .conversationItemAdded(_, item, _): entries.append(item) case let .conversationItemDeleted(_, itemId): entries.removeAll { $0.id == itemId } + case let .conversationItemDone(_, item, _): + if let i = entries.firstIndex(where: { $0.id == item.id }) { + entries[i] = item + } case let .conversationItemInputAudioTranscriptionCompleted(_, itemId, contentIndex, transcript, _, _): updateEvent(id: itemId) { message in guard case let .inputAudio(audio) = message.content[contentIndex] else { return } diff --git a/Sources/UI/Extensions/String+random.swift b/Sources/UI/Extensions/String+random.swift deleted file mode 100644 index bad18c4..0000000 --- a/Sources/UI/Extensions/String+random.swift +++ /dev/null @@ -1,8 +0,0 @@ -import Foundation - -extension String { - init(randomLength length: Int) { - let letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" - self = String((0..