GoogleCloudPlatform · Guiners · Jul 28, 2025 · Jul 28, 2025 · Jul 28, 2025 · Jul 28, 2025
@@ -0,0 +1,88 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This sample sends a text prompt and plays the audio answer with IPython.
+# Install helpers for handling the audio answer: pip install numpy ipython
+
+import asyncio
+
+
+async def generate_content() -> list[str]:
+    """Send a text prompt over the Live API and play the spoken answer.
+
+    Returns:
+        An empty list: the reply is audio, so there is no text to return.
+    """
+    # [START googlegenaisdk_live_audio_with_txt]
+    import numpy as np
+    from IPython.display import Audio, display
+    from google import genai
+    from google.genai.types import (
+        Content,
+        LiveConnectConfig,
+        Modality,
+        Part,
+        SpeechConfig,
+        VoiceConfig,
+        PrebuiltVoiceConfig,
+    )
+
+    client = genai.Client()
+    voice_name = "Aoede"
+    model = "gemini-2.0-flash-live-preview-04-09"
+
+    config = LiveConnectConfig(
+        response_modalities=[Modality.AUDIO],
+        speech_config=SpeechConfig(
+            voice_config=VoiceConfig(
+                prebuilt_voice_config=PrebuiltVoiceConfig(voice_name=voice_name)
+            ),
+        ),
+    )
+
+    async with client.aio.live.connect(model=model, config=config) as session:
+        text_input = "Hello? Gemini are you there?"
+        print("> ", text_input, "\n")
+
+        await session.send_client_content(
+            turns=Content(role="user", parts=[Part(text=text_input)])
+        )
+
+        audio_data = []
+        async for message in session.receive():
+            # server_content may be unset on a message; skip those safely.
+            server_content = message.server_content
+            if not (server_content and server_content.model_turn):
+                continue
+            for part in server_content.model_turn.parts or []:
+                if part.inline_data and part.inline_data.data:
+                    audio_data.append(
+                        np.frombuffer(part.inline_data.data, dtype=np.int16)
+                    )
+
+        if audio_data:
+            print("Received audio answer: ")
+            display(Audio(np.concatenate(audio_data), rate=24000, autoplay=True))
+
+    # Example output:
+    # >  Hello? Gemini are you there?
+    # Received audio answer:
+    # <IPython.lib.display.Audio object>
+    # [END googlegenaisdk_live_audio_with_txt]
+    return []
+
+
+if __name__ == "__main__":  # Run the sample only when executed as a script.
+    asyncio.run(generate_content())
@@ -0,0 +1,71 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav
+# Install helpers for converting files: pip install librosa soundfile
+
+import asyncio
+
+
+async def generate_content() -> list[str]:
+    """Send a WAV clip to the Live API as raw PCM and print the text answer.
+
+    Returns:
+        The text fragments received from the model, in arrival order.
+    """
+    # [START googlegenaisdk_live_txt_with_audio]
+    import io
+    import requests
+    from google import genai
+    from google.genai.types import Modality, LiveConnectConfig, Blob
+    import soundfile as sf
+    import librosa
+
+    client = genai.Client()
+    model = "gemini-2.0-flash-live-preview-04-09"
+    config = LiveConnectConfig(response_modalities=[Modality.TEXT])
+
+    async with client.aio.live.connect(model=model, config=config) as session:
+        audio_url = (
+            "https://storage.googleapis.com/generativeai-downloads/data/16000.wav"
+        )
+        wav_response = requests.get(audio_url)
+        wav_response.raise_for_status()
+        y, sr = librosa.load(io.BytesIO(wav_response.content), sr=16000)
+        # Encode into a fresh buffer so no bytes of the source WAV are sent.
+        buffer = io.BytesIO()
+        sf.write(buffer, y, sr, format="RAW", subtype="PCM_16")
+        audio_bytes = buffer.getvalue()
+
+        # If you've pre-converted to sample.pcm using ffmpeg, use this instead:
+        # audio_bytes = Path("sample.pcm").read_bytes()
+
+        print("> Answer to this audio url", audio_url, "\n")
+
+        await session.send_realtime_input(
+            media=Blob(data=audio_bytes, mime_type="audio/pcm;rate=16000")
+        )
+
+        response = [m.text async for m in session.receive() if m.text is not None]
+        print("".join(response))
+    # Example output:
+    # > Answer to this audio url https://storage.googleapis.com/generativeai-downloads/data/16000.wav
+    # Yes, I can hear you. How can I help you today?
+    # [END googlegenaisdk_live_txt_with_audio]
+    return response
+
+
+if __name__ == "__main__":  # Run the sample only when executed as a script.
+    asyncio.run(generate_content())
@@ -20,7 +20,9 @@ def get_bearer_token() -> str:
     import google.auth
     from google.auth.transport.requests import Request
 
-    creds, _ = google.auth.default(scopes=["https://www.googleapis.com/auth/cloud-platform"])
+    creds, _ = google.auth.default(
+        scopes=["https://www.googleapis.com/auth/cloud-platform"]
+    )
     auth_req = Request()
     creds.refresh(auth_req)
     bearer_token = creds.token
@@ -55,9 +57,7 @@ async def generate_content() -> str:
 
     # Websocket Configuration
     WEBSOCKET_HOST = "us-central1-aiplatform.googleapis.com"
-    WEBSOCKET_SERVICE_URL = (
-        f"wss://{WEBSOCKET_HOST}/ws/google.cloud.aiplatform.v1.LlmBidiService/BidiGenerateContent"
-    )
+    WEBSOCKET_SERVICE_URL = f"wss://{WEBSOCKET_HOST}/ws/google.cloud.aiplatform.v1.LlmBidiService/BidiGenerateContent"
 
     # Websocket Authentication
     headers = {
@@ -66,9 +66,7 @@ async def generate_content() -> str:
     }
 
     # Model Configuration
-    model_path = (
-        f"projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/{GEMINI_MODEL_NAME}"
-    )
+    model_path = f"projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/{GEMINI_MODEL_NAME}"
     model_generation_config = {
         "response_modalities": ["AUDIO"],
         "speech_config": {
@@ -77,7 +75,9 @@ async def generate_content() -> str:
         },
     }
 
-    async with connect(WEBSOCKET_SERVICE_URL, additional_headers=headers) as websocket_session:
+    async with connect(
+        WEBSOCKET_SERVICE_URL, additional_headers=headers
+    ) as websocket_session:
         # 1. Send setup configuration
         websocket_config = {
             "setup": {
@@ -120,7 +120,9 @@ async def generate_content() -> str:
             server_content = response_chunk.get("serverContent")
             if not server_content:
                 # This might indicate an error or an unexpected message format
-                print(f"Received non-serverContent message or empty content: {response_chunk}")
+                print(
+                    f"Received non-serverContent message or empty content: {response_chunk}"
+                )
                 break
 
             # Collect audio chunks
@@ -129,15 +131,19 @@ async def generate_content() -> str:
                 for part in model_turn["parts"]:
                     if part["inlineData"]["mimeType"] == "audio/pcm":
                         audio_chunk = base64.b64decode(part["inlineData"]["data"])
-                        aggregated_response_parts.append(np.frombuffer(audio_chunk, dtype=np.int16))
+                        aggregated_response_parts.append(
+                            np.frombuffer(audio_chunk, dtype=np.int16)
+                        )
 
             # End of response
             if server_content.get("turnComplete"):
                 break
 
         # Save audio to a file
         if aggregated_response_parts:
-            wavfile.write("output.wav", 24000, np.concatenate(aggregated_response_parts))
+            wavfile.write(
+                "output.wav", 24000, np.concatenate(aggregated_response_parts)
+            )
         # Example response:
         #     Setup Response: {'setupComplete': {}}
         #     Input: Hello? Gemini are you there?

@@ -20,7 +20,9 @@ def get_bearer_token() -> str:
     import google.auth
     from google.auth.transport.requests import Request
 
-    creds, _ = google.auth.default(scopes=["https://www.googleapis.com/auth/cloud-platform"])
+    creds, _ = google.auth.default(
+        scopes=["https://www.googleapis.com/auth/cloud-platform"]
+    )
     auth_req = Request()
     creds.refresh(auth_req)
     bearer_token = creds.token
@@ -55,9 +57,7 @@ async def generate_content() -> str:
 
     # Websocket Configuration
     WEBSOCKET_HOST = "us-central1-aiplatform.googleapis.com"
-    WEBSOCKET_SERVICE_URL = (
-        f"wss://{WEBSOCKET_HOST}/ws/google.cloud.aiplatform.v1.LlmBidiService/BidiGenerateContent"
-    )
+    WEBSOCKET_SERVICE_URL = f"wss://{WEBSOCKET_HOST}/ws/google.cloud.aiplatform.v1.LlmBidiService/BidiGenerateContent"
 
     # Websocket Authentication
     headers = {
@@ -66,9 +66,7 @@ async def generate_content() -> str:
     }
 
     # Model Configuration
-    model_path = (
-        f"projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/{GEMINI_MODEL_NAME}"
-    )
+    model_path = f"projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/{GEMINI_MODEL_NAME}"
     model_generation_config = {
         "response_modalities": ["AUDIO"],
         "speech_config": {
@@ -77,7 +75,9 @@ async def generate_content() -> str:
         },
     }
 
-    async with connect(WEBSOCKET_SERVICE_URL, additional_headers=headers) as websocket_session:
+    async with connect(
+        WEBSOCKET_SERVICE_URL, additional_headers=headers
+    ) as websocket_session:
         # 1. Send setup configuration
         websocket_config = {
             "setup": {
@@ -125,7 +125,9 @@ async def generate_content() -> str:
             server_content = response_chunk.get("serverContent")
             if not server_content:
                 # This might indicate an error or an unexpected message format
-                print(f"Received non-serverContent message or empty content: {response_chunk}")
+                print(
+                    f"Received non-serverContent message or empty content: {response_chunk}"
+                )
                 break
 
             # Transcriptions
@@ -142,7 +144,9 @@ async def generate_content() -> str:
                 for part in model_turn["parts"]:
                     if part["inlineData"]["mimeType"] == "audio/pcm":
                         audio_chunk = base64.b64decode(part["inlineData"]["data"])
-                        aggregated_response_parts.append(np.frombuffer(audio_chunk, dtype=np.int16))
+                        aggregated_response_parts.append(
+                            np.frombuffer(audio_chunk, dtype=np.int16)
+                        )
 
             # End of response
             if server_content.get("turnComplete"):

@@ -20,7 +20,9 @@ def get_bearer_token() -> str:
     import google.auth
     from google.auth.transport.requests import Request
 
-    creds, _ = google.auth.default(scopes=["https://www.googleapis.com/auth/cloud-platform"])
+    creds, _ = google.auth.default(
+        scopes=["https://www.googleapis.com/auth/cloud-platform"]
+    )
     auth_req = Request()
     creds.refresh(auth_req)
     bearer_token = creds.token
@@ -65,9 +67,7 @@ def read_wavefile(filepath: str) -> tuple[str, str]:
 
     # Websocket Configuration
     WEBSOCKET_HOST = "us-central1-aiplatform.googleapis.com"
-    WEBSOCKET_SERVICE_URL = (
-        f"wss://{WEBSOCKET_HOST}/ws/google.cloud.aiplatform.v1.LlmBidiService/BidiGenerateContent"
-    )
+    WEBSOCKET_SERVICE_URL = f"wss://{WEBSOCKET_HOST}/ws/google.cloud.aiplatform.v1.LlmBidiService/BidiGenerateContent"
 
     # Websocket Authentication
     headers = {
@@ -76,12 +76,12 @@ def read_wavefile(filepath: str) -> tuple[str, str]:
     }
 
     # Model Configuration
-    model_path = (
-        f"projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/{GEMINI_MODEL_NAME}"
-    )
+    model_path = f"projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/{GEMINI_MODEL_NAME}"
     model_generation_config = {"response_modalities": ["TEXT"]}
 
-    async with connect(WEBSOCKET_SERVICE_URL, additional_headers=headers) as websocket_session:
+    async with connect(
+        WEBSOCKET_SERVICE_URL, additional_headers=headers
+    ) as websocket_session:
         # 1. Send setup configuration
         websocket_config = {
             "setup": {
@@ -105,7 +105,9 @@ def read_wavefile(filepath: str) -> tuple[str, str]:
             return "Error: WebSocket setup failed."
 
         # 3. Send audio message
-        encoded_audio_message, mime_type = read_wavefile("hello_gemini_are_you_there.wav")
+        encoded_audio_message, mime_type = read_wavefile(
+            "hello_gemini_are_you_there.wav"
+        )
         # Example audio message:  "Hello? Gemini are you there?"
 
         user_message = {
@@ -136,7 +138,9 @@ def read_wavefile(filepath: str) -> tuple[str, str]:
             server_content = response_chunk.get("serverContent")
             if not server_content:
                 # This might indicate an error or an unexpected message format
-                print(f"Received non-serverContent message or empty content: {response_chunk}")
+                print(
+                    f"Received non-serverContent message or empty content: {response_chunk}"
+                )
                 break
 
             # Collect text responses