Skip to content

Commit 2c0f7c2

Browse files
authored
fix(twilio): add configurable startup delay to avoid initial audio jitter (fixes #1906) (#2033)
1 parent 6a5d9ce commit 2c0f7c2

File tree

1 file changed

+55
-21
lines changed

1 file changed

+55
-21
lines changed

examples/realtime/twilio/twilio_handler.py

Lines changed: 55 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,10 @@ def get_current_time() -> str:
3434

3535
agent = RealtimeAgent(
3636
name="Twilio Assistant",
37-
instructions="You are a helpful assistant that starts every conversation with a creative greeting. Keep responses concise and friendly since this is a phone conversation.",
37+
instructions=(
38+
"You are a helpful assistant that starts every conversation with a creative greeting. "
39+
"Keep responses concise and friendly since this is a phone conversation."
40+
),
3841
tools=[get_weather, get_current_time],
3942
)
4043

@@ -46,21 +49,39 @@ def __init__(self, twilio_websocket: WebSocket):
4649
self.session: RealtimeSession | None = None
4750
self.playback_tracker = RealtimePlaybackTracker()
4851

49-
# Audio buffering configuration (matching CLI demo)
50-
self.CHUNK_LENGTH_S = 0.05 # 50ms chunks like CLI demo
51-
self.SAMPLE_RATE = 8000 # Twilio uses 8kHz for g711_ulaw
52-
self.BUFFER_SIZE_BYTES = int(self.SAMPLE_RATE * self.CHUNK_LENGTH_S) # 50ms worth of audio
52+
# Audio chunking (matches CLI demo)
53+
self.CHUNK_LENGTH_S = 0.05 # 50ms chunks
54+
self.SAMPLE_RATE = 8000 # Twilio g711_ulaw at 8kHz
55+
self.BUFFER_SIZE_BYTES = int(self.SAMPLE_RATE * self.CHUNK_LENGTH_S) # ~400 bytes per 50ms
5356

5457
self._stream_sid: str | None = None
5558
self._audio_buffer: bytearray = bytearray()
5659
self._last_buffer_send_time = time.time()
5760

58-
# Mark event tracking for playback
61+
# Playback tracking for outbound audio
5962
self._mark_counter = 0
6063
self._mark_data: dict[
6164
str, tuple[str, int, int]
6265
] = {} # mark_id -> (item_id, content_index, byte_count)
6366

67+
# ---- Deterministic startup warm-up (preferred over sleep) ----
68+
# Buffer the first N chunks before sending to OpenAI; then mark warmed.
69+
try:
70+
self.STARTUP_BUFFER_CHUNKS = max(0, int(os.getenv("TWILIO_STARTUP_BUFFER_CHUNKS", "3")))
71+
except Exception:
72+
self.STARTUP_BUFFER_CHUNKS = 3
73+
74+
self._startup_buffer = bytearray()
75+
self._startup_warmed = (
76+
self.STARTUP_BUFFER_CHUNKS == 0
77+
) # if 0, considered warmed immediately
78+
79+
# Optional delay (defaults to 0.0 because buffering is preferred)
80+
try:
81+
self.STARTUP_DELAY_S = float(os.getenv("TWILIO_STARTUP_DELAY_S", "0.0"))
82+
except Exception:
83+
self.STARTUP_DELAY_S = 0.0
84+
6485
async def start(self) -> None:
6586
"""Start the session."""
6687
runner = RealtimeRunner(agent)
@@ -89,6 +110,11 @@ async def start(self) -> None:
89110
await self.twilio_websocket.accept()
90111
print("Twilio WebSocket connection accepted")
91112

113+
# Optional tiny delay (kept configurable; default 0.0)
114+
if self.STARTUP_DELAY_S > 0:
115+
await asyncio.sleep(self.STARTUP_DELAY_S)
116+
117+
# Start loops after handshake
92118
self._realtime_session_task = asyncio.create_task(self._realtime_session_loop())
93119
self._message_loop_task = asyncio.create_task(self._twilio_message_loop())
94120
self._buffer_flush_task = asyncio.create_task(self._buffer_flush_loop())
@@ -197,7 +223,7 @@ async def _handle_media_event(self, message: dict[str, Any]) -> None:
197223
# Add original µ-law to buffer for OpenAI (they expect µ-law)
198224
self._audio_buffer.extend(ulaw_bytes)
199225

200-
# Send buffered audio if we have enough data
226+
# Send buffered audio if we have enough data for one chunk
201227
if len(self._audio_buffer) >= self.BUFFER_SIZE_BYTES:
202228
await self._flush_audio_buffer()
203229

@@ -210,47 +236,55 @@ async def _handle_mark_event(self, message: dict[str, Any]) -> None:
210236
mark_data = message.get("mark", {})
211237
mark_id = mark_data.get("name", "")
212238

213-
# Look up stored data for this mark ID
214239
if mark_id in self._mark_data:
215240
item_id, item_content_index, byte_count = self._mark_data[mark_id]
216-
217-
# Convert byte count back to bytes for playback tracker
218-
audio_bytes = b"\x00" * byte_count # Placeholder bytes
219-
220-
# Update playback tracker
241+
audio_bytes = b"\x00" * byte_count # Placeholder bytes for tracker
221242
self.playback_tracker.on_play_bytes(item_id, item_content_index, audio_bytes)
222243
print(
223244
f"Playback tracker updated: {item_id}, index {item_content_index}, {byte_count} bytes"
224245
)
225-
226-
# Clean up the stored data
227246
del self._mark_data[mark_id]
228247

229248
except Exception as e:
230249
print(f"Error handling mark event: {e}")
231250

232251
async def _flush_audio_buffer(self) -> None:
233-
"""Send buffered audio to OpenAI."""
252+
"""Send buffered audio to OpenAI with deterministic startup warm-up."""
234253
if not self._audio_buffer or not self.session:
235254
return
236255

237256
try:
238-
# Send the buffered audio
239257
buffer_data = bytes(self._audio_buffer)
240-
await self.session.send_audio(buffer_data)
241-
242-
# Clear the buffer
243258
self._audio_buffer.clear()
244259
self._last_buffer_send_time = time.time()
245260

261+
# During startup, accumulate first N chunks before sending anything
262+
if not self._startup_warmed:
263+
self._startup_buffer.extend(buffer_data)
264+
265+
# target bytes = N chunks * bytes-per-chunk
266+
target_bytes = self.BUFFER_SIZE_BYTES * max(0, self.STARTUP_BUFFER_CHUNKS)
267+
268+
if len(self._startup_buffer) >= target_bytes:
269+
# Warm-up complete: flush all buffered data in order
270+
await self.session.send_audio(bytes(self._startup_buffer))
271+
self._startup_buffer.clear()
272+
self._startup_warmed = True
273+
else:
274+
# Not enough yet; keep buffering and return
275+
return
276+
else:
277+
# Already warmed: send immediately
278+
await self.session.send_audio(buffer_data)
279+
246280
except Exception as e:
247281
print(f"Error sending buffered audio to OpenAI: {e}")
248282

249283
async def _buffer_flush_loop(self) -> None:
250284
"""Periodically flush audio buffer to prevent stale data."""
251285
try:
252286
while True:
253-
await asyncio.sleep(self.CHUNK_LENGTH_S) # Check every 50ms
287+
await asyncio.sleep(self.CHUNK_LENGTH_S) # check every 50ms
254288

255289
# If buffer has data and it's been too long since last send, flush it
256290
current_time = time.time()

0 commit comments

Comments
 (0)