diff --git a/examples/live/live_prompt b/examples/live/live_prompt
new file mode 100755
index 00000000..0b2012e9
Binary files /dev/null and b/examples/live/live_prompt differ
diff --git a/examples/live/live_prompt.go b/examples/live/live_prompt.go
new file mode 100644
index 00000000..490c2e17
--- /dev/null
+++ b/examples/live/live_prompt.go
@@ -0,0 +1,245 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build ignore_vet
+
+// live_prompt sends a single text prompt over a Gemini Live session and saves
+// the audio reply to output.wav. With --voice-sample (plus --voice-consent or
+// --voice-signature) the session is configured with a replicated voice.
+package main
+
+import (
+	"context"
+	"encoding/binary"
+	"flag"
+	"fmt"
+	"io"
+	"log"
+	"os"
+
+	"google.golang.org/genai"
+)
+
+var voiceSample = flag.String("voice-sample", "", "Path to voice sample file")
+var voiceConsent = flag.String("voice-consent", "", "Path to voice consent file")
+var voiceSignature = flag.String("voice-signature", "", "Voice consent signature")
+var modelFlag = flag.String("model", "", "Model name")
+var promptFlag = flag.String("prompt", "Hello Gemini, are you there?", "Text prompt for testing")
+
+// wavHeaderSize is the length in bytes of the canonical PCM WAV header that
+// wavHeader produces.
+const wavHeaderSize = 44
+
+// main parses the flags, opens a Live session, sends the prompt as realtime
+// text input, collects the audio parts of the model turn, and writes them to
+// output.wav.
+func main() {
+	flag.Parse()
+	log.SetFlags(0)
+
+	if *promptFlag == "" {
+		log.Fatal("--prompt must be specified")
+	}
+
+	var voiceSampleAudio []byte
+	var consentAudio []byte
+
+	if *voiceSample != "" {
+		var err error
+		voiceSampleAudio, err = os.ReadFile(*voiceSample)
+		if err != nil {
+			log.Fatal("read voice sample error: ", err)
+		}
+		if *voiceConsent != "" {
+			consentAudio, err = os.ReadFile(*voiceConsent)
+			if err != nil {
+				log.Fatal("read voice consent error: ", err)
+			}
+		}
+		// A replicated voice needs proof of consent: either the recording
+		// itself or a signature from an earlier verified session.
+		if len(consentAudio) == 0 && *voiceSignature == "" {
+			log.Fatal("Either --voice-consent or --voice-signature must be provided when --voice-sample is used.")
+		}
+	}
+
+	ctx := context.Background()
+	client, err := genai.NewClient(ctx, nil)
+	if err != nil {
+		log.Fatal("create client error: ", err)
+	}
+
+	var model string
+	if *modelFlag != "" {
+		model = *modelFlag
+	} else if client.ClientConfig().Backend == genai.BackendVertexAI {
+		model = "gemini-2.0-flash-live-preview-04-09"
+	} else {
+		model = "gemini-live-2.5-flash-preview"
+	}
+
+	config := &genai.LiveConnectConfig{}
+	config.ResponseModalities = []genai.Modality{genai.ModalityAudio}
+
+	if len(voiceSampleAudio) > 0 {
+		replicatedConfig := &genai.ReplicatedVoiceConfig{
+			MIMEType:         "audio/wav",
+			VoiceSampleAudio: voiceSampleAudio,
+		}
+		if len(consentAudio) > 0 {
+			replicatedConfig.ConsentAudio = consentAudio
+		}
+		if *voiceSignature != "" {
+			replicatedConfig.VoiceConsentSignature = &genai.VoiceConsentSignature{
+				Signature: *voiceSignature,
+			}
+		}
+		config.SpeechConfig = &genai.SpeechConfig{
+			VoiceConfig: &genai.VoiceConfig{
+				ReplicatedVoiceConfig: replicatedConfig,
+			},
+		}
+	}
+
+	session, err := client.Live.Connect(ctx, model, config)
+	if err != nil {
+		log.Fatal("connect to model error: ", err)
+	}
+	defer session.Close()
+
+	// Read SetupComplete
+	setupMsg, err := session.Receive()
+	if err != nil {
+		log.Fatal("receive setup complete error: ", err)
+	}
+	if setupMsg.SetupComplete != nil && setupMsg.SetupComplete.VoiceConsentSignature != nil {
+		log.Printf("\n=== Voice Consent Signature Received ===\n%s\n========================================\n", setupMsg.SetupComplete.VoiceConsentSignature.Signature)
+	}
+
+	fmt.Println("Sending prompt:", *promptFlag)
+	err = session.SendRealtimeInput(genai.LiveRealtimeInput{
+		Text: *promptFlag,
+	})
+	if err != nil {
+		log.Fatal("send prompt error: ", err)
+	}
+
+	var audioData []byte
+	for {
+		msg, err := session.Receive()
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			log.Fatal("receive error: ", err)
+		}
+
+		content := msg.ServerContent
+		if content == nil {
+			continue
+		}
+		// Collect audio before honoring TurnComplete so that a message carrying
+		// both model output and the completion flag does not lose its audio.
+		if content.ModelTurn != nil {
+			for _, part := range content.ModelTurn.Parts {
+				if part.InlineData != nil && len(part.InlineData.Data) > 0 {
+					audioData = append(audioData, part.InlineData.Data...)
+					fmt.Printf("Received audio chunk: %d bytes\n", len(part.InlineData.Data))
+				}
+			}
+		}
+		if content.TurnComplete {
+			break
+		}
+	}
+
+	if len(audioData) > 0 {
+		err = saveWav(audioData, "output.wav")
+		if err != nil {
+			log.Fatal("save wav error: ", err)
+		}
+	} else {
+		fmt.Println("No audio data received.")
+	}
+}
+
+// wavHeader returns the 44-byte RIFF/WAVE header for dataLen bytes of PCM
+// audio. The format is fixed at 24kHz, 16-bit, mono, which is what this
+// example assumes the session returns.
+func wavHeader(dataLen uint32) []byte {
+	const (
+		sampleRate    = 24000
+		bitsPerSample = 16
+		channels      = 1
+		blockAlign    = channels * bitsPerSample / 8
+		byteRate      = sampleRate * blockAlign
+	)
+
+	hdr := make([]byte, wavHeaderSize)
+	le := binary.LittleEndian
+
+	// RIFF chunk descriptor; its size field counts everything after itself.
+	copy(hdr[0:], "RIFF")
+	le.PutUint32(hdr[4:], wavHeaderSize-8+dataLen)
+	copy(hdr[8:], "WAVE")
+
+	// fmt sub-chunk: 16 bytes describing uncompressed PCM.
+	copy(hdr[12:], "fmt ")
+	le.PutUint32(hdr[16:], 16)
+	le.PutUint16(hdr[20:], 1) // PCM
+	le.PutUint16(hdr[22:], channels)
+	le.PutUint32(hdr[24:], sampleRate)
+	le.PutUint32(hdr[28:], byteRate)
+	le.PutUint16(hdr[32:], blockAlign)
+	le.PutUint16(hdr[34:], bitsPerSample)
+
+	// data sub-chunk header; the samples follow it directly.
+	copy(hdr[36:], "data")
+	le.PutUint32(hdr[40:], dataLen)
+
+	return hdr
+}
+
+// saveWav writes data to filename as a PCM WAV file and reports success on
+// stdout. data is treated as raw samples in the fixed format described on
+// wavHeader and is written unmodified after the header.
+//
+// It returns the first error from creating, writing, or closing the file, so
+// a truncated file is never reported as saved.
+func saveWav(data []byte, filename string) error {
+	// RIFF size fields are 32-bit; refuse payloads the header cannot describe.
+	if uint64(len(data)) > 1<<32-1-(wavHeaderSize-8) {
+		return fmt.Errorf("audio data too large for a WAV file: %d bytes", len(data))
+	}
+
+	f, err := os.Create(filename)
+	if err != nil {
+		return err
+	}
+
+	_, err = f.Write(wavHeader(uint32(len(data))))
+	if err == nil {
+		_, err = f.Write(data)
+	}
+	// Always close, but let an earlier write error take precedence.
+	if cerr := f.Close(); err == nil {
+		err = cerr
+	}
+	if err != nil {
+		return err
+	}
+
+	fmt.Println("Saved audio response to", filename)
+	return nil
+}
diff --git a/examples/live/live_streaming_server b/examples/live/live_streaming_server
new file mode 100755
index 00000000..6277a2de
Binary files /dev/null and b/examples/live/live_streaming_server differ
diff --git a/examples/live/live_streaming_server.go b/examples/live/live_streaming_server.go index 6881f60d..9bf4a145 100644 --- a/examples/live/live_streaming_server.go +++ b/examples/live/live_streaming_server.go @@ -34,6 +34,13 @@ import ( ) var addr = flag.String("addr", "localhost:8080", "http service address") +var voiceSample = flag.String("voice-sample", "", "Path to voice sample file") +var voiceConsent = flag.String("voice-consent", "", "Path to voice consent file") +var voiceSignature = flag.String("voice-signature", "", "Voice consent signature") +var modelFlag = flag.String("model", "", "Model name") + +var voiceSampleAudio []byte +var consentAudio []byte var upgrader = websocket.Upgrader{} // use default options @@ -66,18 +73,61 @@ func live(w http.ResponseWriter, r *http.Request) { } var model string - if client.ClientConfig().Backend == genai.BackendVertexAI { + if *modelFlag != "" { + model = *modelFlag + } else if client.ClientConfig().Backend == genai.BackendVertexAI { model = "gemini-2.0-flash-live-preview-04-09" } else { model = "gemini-live-2.5-flash-preview" } + config := &genai.LiveConnectConfig{} + config.ResponseModalities = []genai.Modality{genai.ModalityAudio} + if len(voiceSampleAudio) > 0 { + replicatedConfig := &genai.ReplicatedVoiceConfig{ + MIMEType: "audio/wav", + VoiceSampleAudio: voiceSampleAudio, + } + if len(consentAudio) > 0 { + replicatedConfig.ConsentAudio = consentAudio + } + if *voiceSignature != "" { + replicatedConfig.VoiceConsentSignature = &genai.VoiceConsentSignature{ + Signature: *voiceSignature, + } + } + config.SpeechConfig = &genai.SpeechConfig{ + VoiceConfig: &genai.VoiceConfig{ + ReplicatedVoiceConfig: replicatedConfig, + }, + } + } + // Establish the live WebSocket connection with the specified GenAI model. 
- session, err := client.Live.Connect(ctx, model, &genai.LiveConnectConfig{}) + session, err := client.Live.Connect(ctx, model, config) if err != nil { // Log fatal error if connecting to the model fails (e.g., network issues, invalid model name). log.Fatal("connect to model error: ", err) } + + // Read the first message which should be SetupComplete + setupMsg, err := session.Receive() + if err != nil { + log.Fatal("receive setup complete error: ", err) + } + if setupMsg.SetupComplete != nil && setupMsg.SetupComplete.VoiceConsentSignature != nil { + log.Printf("\n=== Voice Consent Signature Received ===\n%s\n========================================\n", setupMsg.SetupComplete.VoiceConsentSignature.Signature) + } + + // Forward SetupComplete to client + setupBytes, err := json.Marshal(setupMsg) + if err != nil { + log.Fatal("marshal setup complete error: ", err) + } + err = c.WriteMessage(websocket.TextMessage, setupBytes) + if err != nil { + log.Println("write setup complete error: ", err) + } defer session.Close() // Ensure session is closed when the handler exits // Goroutine to receive messages from the GenAI service and send to the client @@ -176,6 +226,24 @@ func proxyVideo(w http.ResponseWriter, r *http.Request) { func main() { flag.Parse() log.SetFlags(0) + + if *voiceSample != "" { + var err error + voiceSampleAudio, err = os.ReadFile(*voiceSample) + if err != nil { + log.Fatal("read voice sample error: ", err) + } + if *voiceConsent != "" { + consentAudio, err = os.ReadFile(*voiceConsent) + if err != nil { + log.Fatal("read voice consent error: ", err) + } + } + if len(consentAudio) == 0 && *voiceSignature == "" { + log.Fatal("Either --voice-consent or --voice-signature must be provided when --voice-sample is used.") + } + } + http.HandleFunc("/", homePage) http.HandleFunc("/live", live) http.HandleFunc("/proxyVideo", proxyVideo) @@ -194,3 +262,4 @@ func main() { log.Fatal(err) } } + diff --git a/examples/live/output.wav b/examples/live/output.wav new file 
mode 100644 index 00000000..31a5615c Binary files /dev/null and b/examples/live/output.wav differ diff --git a/live.go b/live.go index a3439bfa..c38cc02d 100644 --- a/live.go +++ b/live.go @@ -157,6 +157,7 @@ func (r *Live) Connect(context context.Context, model string, config *LiveConnec if err != nil { return nil, fmt.Errorf("failed to write LiveClientSetup: %w", err) } + return s, nil } diff --git a/live_converters.go b/live_converters.go index de655266..ecd00dcf 100644 --- a/live_converters.go +++ b/live_converters.go @@ -595,6 +595,11 @@ func liveConnectConfigToVertex(fromObject map[string]any, parentObject map[strin return nil, err } + fromSpeechConfig, err = speechConfigToVertex(fromSpeechConfig.(map[string]any), toObject, rootObject) + if err != nil { + return nil, err + } + InternalSetValueByPath(parentObject, []string{"setup", "generationConfig", "speechConfig"}, fromSpeechConfig) } diff --git a/live_test.go b/live_test.go index f8e21dce..e9973bca 100644 --- a/live_test.go +++ b/live_test.go @@ -311,6 +311,7 @@ func TestLiveConnect(t *testing.T) { t.Errorf("Connect() error message = %v, wantErrMessage %v", err.Error(), tt.wantErrMessage) return } + defer session.Close() }) } diff --git a/models.go b/models.go index 9ff5a7bd..d3272cc4 100644 --- a/models.go +++ b/models.go @@ -1493,6 +1493,11 @@ func generateContentConfigToVertex(ac *InternalAPIClient, fromObject map[string] return nil, err } + fromSpeechConfig, err = speechConfigToVertex(fromSpeechConfig.(map[string]any), toObject, rootObject) + if err != nil { + return nil, err + } + InternalSetValueByPath(toObject, []string{"speechConfig"}, fromSpeechConfig) } @@ -2720,6 +2725,11 @@ func generationConfigToVertex(fromObject map[string]any, parentObject map[string fromSpeechConfig := InternalGetValueByPath(fromObject, []string{"speechConfig"}) if fromSpeechConfig != nil { + fromSpeechConfig, err = speechConfigToVertex(fromSpeechConfig.(map[string]any), toObject, rootObject) + if err != nil { + return nil, 
err + } + InternalSetValueByPath(toObject, []string{"speechConfig"}, fromSpeechConfig) } @@ -3311,6 +3321,22 @@ func modelFromVertex(fromObject map[string]any, parentObject map[string]any, roo return toObject, nil } +func multiSpeakerVoiceConfigToVertex(fromObject map[string]any, parentObject map[string]any, rootObject map[string]any) (toObject map[string]any, err error) { + toObject = make(map[string]any) + + fromSpeakerVoiceConfigs := InternalGetValueByPath(fromObject, []string{"speakerVoiceConfigs"}) + if fromSpeakerVoiceConfigs != nil { + fromSpeakerVoiceConfigs, err = InternalApplyConverterToSliceWithRoot(fromSpeakerVoiceConfigs.([]any), speakerVoiceConfigToVertex, rootObject) + if err != nil { + return nil, err + } + + InternalSetValueByPath(toObject, []string{"speakerVoiceConfigs"}, fromSpeakerVoiceConfigs) + } + + return toObject, nil +} + func partToMldev(fromObject map[string]any, parentObject map[string]any, rootObject map[string]any) (toObject map[string]any, err error) { toObject = make(map[string]any) @@ -3687,6 +3713,30 @@ func referenceImageAPIToVertex(fromObject map[string]any, parentObject map[strin return toObject, nil } +func replicatedVoiceConfigToVertex(fromObject map[string]any, parentObject map[string]any, rootObject map[string]any) (toObject map[string]any, err error) { + toObject = make(map[string]any) + + fromMimeType := InternalGetValueByPath(fromObject, []string{"mimeType"}) + if fromMimeType != nil { + InternalSetValueByPath(toObject, []string{"mimeType"}, fromMimeType) + } + + fromVoiceSampleAudio := InternalGetValueByPath(fromObject, []string{"voiceSampleAudio"}) + if fromVoiceSampleAudio != nil { + InternalSetValueByPath(toObject, []string{"voiceSampleAudio"}, fromVoiceSampleAudio) + } + + if InternalGetValueByPath(fromObject, []string{"consentAudio"}) != nil { + return nil, fmt.Errorf("consentAudio parameter is not supported in Gemini Enterprise Agent Platform (previously known as Vertex AI)") + } + + if 
InternalGetValueByPath(fromObject, []string{"voiceConsentSignature"}) != nil { + return nil, fmt.Errorf("voiceConsentSignature parameter is not supported in Gemini Enterprise Agent Platform (previously known as Vertex AI)") + } + + return toObject, nil +} + func safetyAttributesFromMldev(fromObject map[string]any, parentObject map[string]any, rootObject map[string]any) (toObject map[string]any, err error) { toObject = make(map[string]any) @@ -3880,6 +3930,58 @@ func segmentImageSourceToVertex(fromObject map[string]any, parentObject map[stri return toObject, nil } +func speakerVoiceConfigToVertex(fromObject map[string]any, parentObject map[string]any, rootObject map[string]any) (toObject map[string]any, err error) { + toObject = make(map[string]any) + + fromSpeaker := InternalGetValueByPath(fromObject, []string{"speaker"}) + if fromSpeaker != nil { + InternalSetValueByPath(toObject, []string{"speaker"}, fromSpeaker) + } + + fromVoiceConfig := InternalGetValueByPath(fromObject, []string{"voiceConfig"}) + if fromVoiceConfig != nil { + fromVoiceConfig, err = voiceConfigToVertex(fromVoiceConfig.(map[string]any), toObject, rootObject) + if err != nil { + return nil, err + } + + InternalSetValueByPath(toObject, []string{"voiceConfig"}, fromVoiceConfig) + } + + return toObject, nil +} + +func speechConfigToVertex(fromObject map[string]any, parentObject map[string]any, rootObject map[string]any) (toObject map[string]any, err error) { + toObject = make(map[string]any) + + fromVoiceConfig := InternalGetValueByPath(fromObject, []string{"voiceConfig"}) + if fromVoiceConfig != nil { + fromVoiceConfig, err = voiceConfigToVertex(fromVoiceConfig.(map[string]any), toObject, rootObject) + if err != nil { + return nil, err + } + + InternalSetValueByPath(toObject, []string{"voiceConfig"}, fromVoiceConfig) + } + + fromLanguageCode := InternalGetValueByPath(fromObject, []string{"languageCode"}) + if fromLanguageCode != nil { + InternalSetValueByPath(toObject, []string{"languageCode"}, 
fromLanguageCode) + } + + fromMultiSpeakerVoiceConfig := InternalGetValueByPath(fromObject, []string{"multiSpeakerVoiceConfig"}) + if fromMultiSpeakerVoiceConfig != nil { + fromMultiSpeakerVoiceConfig, err = multiSpeakerVoiceConfigToVertex(fromMultiSpeakerVoiceConfig.(map[string]any), toObject, rootObject) + if err != nil { + return nil, err + } + + InternalSetValueByPath(toObject, []string{"multiSpeakerVoiceConfig"}, fromMultiSpeakerVoiceConfig) + } + + return toObject, nil +} + func toolConfigToMldev(fromObject map[string]any, parentObject map[string]any, rootObject map[string]any) (toObject map[string]any, err error) { toObject = make(map[string]any) @@ -4466,6 +4568,27 @@ func videoToVertex(fromObject map[string]any, parentObject map[string]any, rootO return toObject, nil } +func voiceConfigToVertex(fromObject map[string]any, parentObject map[string]any, rootObject map[string]any) (toObject map[string]any, err error) { + toObject = make(map[string]any) + + fromReplicatedVoiceConfig := InternalGetValueByPath(fromObject, []string{"replicatedVoiceConfig"}) + if fromReplicatedVoiceConfig != nil { + fromReplicatedVoiceConfig, err = replicatedVoiceConfigToVertex(fromReplicatedVoiceConfig.(map[string]any), toObject, rootObject) + if err != nil { + return nil, err + } + + InternalSetValueByPath(toObject, []string{"replicatedVoiceConfig"}, fromReplicatedVoiceConfig) + } + + fromPrebuiltVoiceConfig := InternalGetValueByPath(fromObject, []string{"prebuiltVoiceConfig"}) + if fromPrebuiltVoiceConfig != nil { + InternalSetValueByPath(toObject, []string{"prebuiltVoiceConfig"}, fromPrebuiltVoiceConfig) + } + + return toObject, nil +} + // Models provides methods for interacting with the available language models. // You don't need to initiate this struct. Create a client instance via NewClient, and // then access Models through client.Models field. 
diff --git a/types.go b/types.go index 90ca3b59..702956a3 100644 --- a/types.go +++ b/types.go @@ -2392,6 +2392,12 @@ type ToolConfig struct { IncludeServerSideToolInvocations *bool `json:"includeServerSideToolInvocations,omitempty"` } +// The signature of the voice consent check. +type VoiceConsentSignature struct { + // Optional. The signature string. + Signature string `json:"signature,omitempty"` +} + // The configuration for the replicated voice to use. type ReplicatedVoiceConfig struct { // Optional. The mimetype of the voice sample. The only currently supported @@ -2400,6 +2406,16 @@ type ReplicatedVoiceConfig struct { MIMEType string `json:"mimeType,omitempty"` // Optional. The sample of the custom voice. VoiceSampleAudio []byte `json:"voiceSampleAudio,omitempty"` + // Optional. Recorded consent verifying ownership of the voice. This + // represents 16-bit signed little-endian wav data, with a 24kHz sampling + // rate. + ConsentAudio []byte `json:"consentAudio,omitempty"` + // Optional. Signature of a previously verified consent audio. This should be + // populated with a signature generated by the server for a previous + // request containing the consent_audio field. When provided, the + // signature is verified instead of the consent_audio field to reduce + // latency. Requests will fail if the signature is invalid or expired. + VoiceConsentSignature *VoiceConsentSignature `json:"voiceConsentSignature,omitempty"` } // Configuration for a prebuilt voice. @@ -7025,6 +7041,11 @@ func (r *ContentReferenceImage) referenceImageAPI() *referenceImageAPI { type LiveServerSetupComplete struct { // Optional. The session ID of the live session. SessionID string `json:"sessionId,omitempty"` + // Optional. Signature of the verified consent audio. This is populated when the + // request has a ReplicatedVoiceConfig with consent_audio set, if the consent + // verification was successful. 
This may be used in a subsequent request + // instead of the consent_audio to verify the same consent. + VoiceConsentSignature *VoiceConsentSignature `json:"voiceConsentSignature,omitempty"` } // Audio transcription in Server Conent.