openai · bfioca-openai · Oct 8, 2025 · Oct 8, 2025
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -9,11 +9,11 @@
     "lint": "next lint"
   },
   "dependencies": {
-    "@openai/agents": "^0.0.5",
+    "@openai/agents": "^0.1.9",
     "@radix-ui/react-icons": "^1.3.2",
     "dotenv": "^16.4.7",
     "next": "^15.3.1",
-    "openai": "^4.77.3",
+    "openai": "^6.2.0",
     "react": "^19.0.0",
     "react-dom": "^19.0.0",
     "react-markdown": "^9.0.3",

diff --git a/src/app/App.tsx b/src/app/App.tsx
@@ -273,7 +273,12 @@ function App() {
     sendEvent({
       type: 'session.update',
       session: {
-        turn_detection: turnDetection,
+        type: 'realtime',
+        audio: {
+          input: {
+            turn_detection: turnDetection,
+          },
+        },
       },
     });
 
@@ -302,6 +307,7 @@ function App() {
     interrupt();
 
     setIsPTTUserSpeaking(true);
+    mute(false);
     sendClientEvent({ type: 'input_audio_buffer.clear' }, 'clear PTT buffer');
 
     // No placeholder; we'll rely on server transcript once ready.
@@ -314,6 +320,8 @@ function App() {
     setIsPTTUserSpeaking(false);
     sendClientEvent({ type: 'input_audio_buffer.commit' }, 'commit PTT');
     sendClientEvent({ type: 'response.create' }, 'trigger response PTT');
+    sendClientEvent({ type: 'input_audio_buffer.clear' }, 'reset buffer after PTT');
+    mute(true);
   };
 
   const onToggleConnection = () => {
@@ -382,6 +390,16 @@ function App() {
     );
   }, [isAudioPlaybackEnabled]);
 
+  useEffect(() => { // keeps the value passed to mute() in sync with connection, PTT and playback state
+    if (sessionStatus !== 'CONNECTED') return; // nothing to sync unless the session is connected
+
+    const shouldMuteMic = isPTTActive
+      ? !isPTTUserSpeaking // PTT on: unmuted only while the user is speaking
+      : !isAudioPlaybackEnabled; // PTT off: muted exactly when audio playback is disabled
+
+    mute(shouldMuteMic);
+  }, [sessionStatus, isPTTActive, isPTTUserSpeaking, isAudioPlaybackEnabled, mute]); // NOTE(review): `mute` is a dependency — if its identity changes on every render this effect re-runs on every render; confirm it is memoized
+
   useEffect(() => {
     if (audioElementRef.current) {
       if (isAudioPlaybackEnabled) {
@@ -395,28 +413,8 @@ function App() {
         audioElementRef.current.pause();
       }
     }
-
-    // Toggle server-side audio stream mute so bandwidth is saved when the
-    // user disables playback. 
-    try {
-      mute(!isAudioPlaybackEnabled);
-    } catch (err) {
-      console.warn('Failed to toggle SDK mute', err);
-    }
   }, [isAudioPlaybackEnabled]);
 
-  // Ensure mute state is propagated to transport right after we connect or
-  // whenever the SDK client reference becomes available.
-  useEffect(() => {
-    if (sessionStatus === 'CONNECTED') {
-      try {
-        mute(!isAudioPlaybackEnabled);
-      } catch (err) {
-        console.warn('mute sync after connect failed', err);
-      }
-    }
-  }, [sessionStatus, isAudioPlaybackEnabled]);
-
   useEffect(() => {
     if (sessionStatus === "CONNECTED" && audioElementRef.current?.srcObject) {
       // The remote audio stream from the audio element.

diff --git a/src/app/agentConfigs/chatSupervisor/index.ts b/src/app/agentConfigs/chatSupervisor/index.ts
@@ -3,7 +3,7 @@ import { getNextResponseFromSupervisor } from './supervisorAgent';
 
 export const chatAgent = new RealtimeAgent({
   name: 'chatAgent',
-  voice: 'sage',
+  voice: 'marin',
   instructions: `
 You are a helpful junior customer service agent. Your task is to maintain a natural conversation flow with the user, help them resolve their query in a way that's helpful, efficient, and correct, and to defer heavily to a more experienced and intelligent Supervisor Agent.
 

diff --git a/src/app/agentConfigs/customerServiceRetail/authentication.ts b/src/app/agentConfigs/customerServiceRetail/authentication.ts
@@ -2,7 +2,7 @@ import { RealtimeAgent, tool } from '@openai/agents/realtime';
 
 export const authenticationAgent = new RealtimeAgent({
   name: 'authentication',
-  voice: 'sage',  
+  voice: 'marin',
   handoffDescription:
     'The initial agent that greets the user, does authentication and routes them to the correct downstream agent.',
 

diff --git a/src/app/agentConfigs/customerServiceRetail/returns.ts b/src/app/agentConfigs/customerServiceRetail/returns.ts
@@ -2,7 +2,7 @@ import { RealtimeAgent, tool, RealtimeItem } from '@openai/agents/realtime';
 
 export const returnsAgent = new RealtimeAgent({
   name: 'returns',
-  voice: 'sage',
+  voice: 'marin',
   handoffDescription:
     'Customer Service Agent specialized in order lookups, policy checks, and return initiations.',
 

diff --git a/src/app/agentConfigs/customerServiceRetail/sales.ts b/src/app/agentConfigs/customerServiceRetail/sales.ts
@@ -2,7 +2,7 @@ import { RealtimeAgent, tool } from '@openai/agents/realtime';
 
 export const salesAgent = new RealtimeAgent({
   name: 'salesAgent',
-  voice: 'sage',
+  voice: 'marin',
   handoffDescription:
     "Handles sales-related inquiries, including new product details, recommendations, promotions, and purchase flows. Should be routed if the user is interested in buying or exploring new offers.",
 

diff --git a/src/app/agentConfigs/customerServiceRetail/simulatedHuman.ts b/src/app/agentConfigs/customerServiceRetail/simulatedHuman.ts
@@ -2,7 +2,7 @@ import { RealtimeAgent } from '@openai/agents/realtime';
 
 export const simulatedHumanAgent = new RealtimeAgent({
   name: 'simulatedHuman',
-  voice: 'sage',
+  voice: 'marin',
   handoffDescription:
     'Placeholder, simulated human agent that can provide more advanced help to the user. Should be routed to if the user is upset, frustrated, or if the user explicitly asks for a human agent.',
   instructions:

diff --git a/src/app/agentConfigs/simpleHandoff.ts b/src/app/agentConfigs/simpleHandoff.ts
@@ -4,7 +4,7 @@ import {
 
 export const haikuWriterAgent = new RealtimeAgent({
   name: 'haikuWriter',
-  voice: 'sage',
+  voice: 'marin',
   instructions:
     'Ask the user for a topic, then reply with a haiku about that topic.',
   handoffs: [],
@@ -14,7 +14,7 @@ export const haikuWriterAgent = new RealtimeAgent({
 
 export const greeterAgent = new RealtimeAgent({
   name: 'greeter',
-  voice: 'sage',
+  voice: 'marin',
   instructions:
     "Please greet the user and ask them if they'd like a Haiku. If yes, hand off to the 'haiku' agent.",
   handoffs: [haikuWriterAgent],

diff --git a/src/app/api/session/route.ts b/src/app/api/session/route.ts
@@ -1,22 +1,37 @@
 import { NextResponse } from "next/server";
+import OpenAI from "openai";
+
+const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY }); // NOTE(review): built once at module load, not per request — confirm the SDK constructor tolerates OPENAI_API_KEY being unset at import/build time
 
 export async function GET() {
   try {
-    const response = await fetch(
-      "https://api.openai.com/v1/realtime/sessions",
-      {
-        method: "POST",
-        headers: {
-          Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
-          "Content-Type": "application/json",
+    const data = await openai.realtime.clientSecrets.create({
+      session: {
+        type: "realtime",
+        model: "gpt-realtime",
+        output_modalities: ["audio"],
+        audio: {
+          input: {
+            format: { type: "audio/pcm", rate: 24000 },
+            transcription: {
+              model: "gpt-4o-mini-transcribe",
+            },
+          },
+          output: {
+            format: { type: "audio/pcm", rate: 24000 },
+            voice: "marin",
+          },
         },
-        body: JSON.stringify({
-          model: "gpt-4o-realtime-preview-2025-06-03",
-        }),
-      }
-    );
-    const data = await response.json();
-    return NextResponse.json(data);
+      },
+    });
+
+    return NextResponse.json({
+      client_secret: {
+        value: data.value,
+        expires_at: data.expires_at,
+      },
+      session: data.session,
+    });
   } catch (error) {
     console.error("Error in /session:", error);
     return NextResponse.json(

diff --git a/src/app/hooks/useRealtimeSession.ts b/src/app/hooks/useRealtimeSession.ts
@@ -50,10 +50,12 @@ export function useRealtimeSession(callbacks: RealtimeSessionCallbacks = {}) {
         historyHandlers.handleTranscriptionCompleted(event);
         break;
       }
+      case "response.output_audio_transcript.done":
       case "response.audio_transcript.done": {
         historyHandlers.handleTranscriptionCompleted(event);
         break;
       }
+      case "response.output_audio_transcript.delta":
       case "response.audio_transcript.delta": {
         historyHandlers.handleTranscriptionDelta(event);
         break;
@@ -137,12 +139,19 @@ export function useRealtimeSession(callbacks: RealtimeSessionCallbacks = {}) {
             return pc;
           },
         }),
-        model: 'gpt-4o-realtime-preview-2025-06-03',
+        model: 'gpt-realtime',
         config: {
-          inputAudioFormat: audioFormat,
-          outputAudioFormat: audioFormat,
-          inputAudioTranscription: {
-            model: 'gpt-4o-mini-transcribe',
+          outputModalities: ['audio'],
+          audio: {
+            input: {
+              format: audioFormat,
+              transcription: {
+                model: 'gpt-4o-mini-transcribe',
+              },
+            },
+            output: {
+              format: audioFormat,
+            },
           },
         },
         outputGuardrails: outputGuardrails ?? [],

diff --git a/src/app/lib/codecUtils.ts b/src/app/lib/codecUtils.ts
@@ -1,11 +1,18 @@
-export function audioFormatForCodec(codec: string): 'pcm16' | 'g711_ulaw' | 'g711_alaw' {
-  let audioFormat: 'pcm16' | 'g711_ulaw' | 'g711_alaw' = 'pcm16';
-  if (typeof window !== 'undefined') {
-    const c = codec.toLowerCase();
-    if (c === 'pcmu') audioFormat = 'g711_ulaw';
-    else if (c === 'pcma') audioFormat = 'g711_alaw';
+import type { RealtimeAudioFormat } from '@openai/agents/realtime';
+
+export function audioFormatForCodec(codec: string): RealtimeAudioFormat { // maps a codec name (compared case-insensitively) to an audio format object
+  const normalized = typeof codec === 'string' ? codec.toLowerCase() : 'opus'; // a non-string value is treated as 'opus', which takes the default branch below
+
+  if (normalized === 'pcmu') {
+    return { type: 'audio/pcmu' };
   }
-  return audioFormat;
+
+  if (normalized === 'pcma') {
+    return { type: 'audio/pcma' };
+  }
+
+  // Default: every other value (including 'opus' and the non-string fallback) maps to 'audio/pcm' at rate 24000
+  return { type: 'audio/pcm', rate: 24000 };
 }
 
 // Apply preferred codec on a peer connection's audio transceivers. Safe to call multiple times.