Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog/4013.changed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
- Added context prewarming path for `InworldTTSService` to improve first audio latency
37 changes: 37 additions & 0 deletions src/pipecat/services/inworld/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
ErrorFrame,
Frame,
InterruptionFrame,
LLMFullResponseStartFrame,
StartFrame,
TTSAudioRawFrame,
TTSStartedFrame,
Expand Down Expand Up @@ -653,6 +654,11 @@ def __init__(
# Track the end time of the last word in the current generation
self._generation_end_time = 0.0

# Context ID that was pre-opened on the server during process_frame
# (LLMFullResponseStartFrame) to avoid context creation latency when
# enough context for TTS is available.
self._prewarmed_context_id: Optional[str] = None

# Init-only config (not runtime-updatable).
self._audio_encoding = encoding
self._audio_sample_rate = 0 # Set in start()
Expand Down Expand Up @@ -726,6 +732,29 @@ async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirect
if isinstance(frame, TTSStoppedFrame):
await self.add_word_timestamps([("Reset", 0)])

async def process_frame(self, frame: Frame, direction: FrameDirection):
"""Process incoming frames and pre-open context on LLM response start.

Eagerly sends the context configuration to the server when
LLMFullResponseStartFrame arrives, so the context is ready by the time
enough context for TTS is available. The base class assigns ``_turn_context_id`` before
this runs, which is reused for all ``run_tts`` calls within the turn.
"""
await super().process_frame(frame, direction)

if isinstance(frame, LLMFullResponseStartFrame):
if self._prewarmed_context_id:
try:
await self._send_close_context(self._prewarmed_context_id)
except Exception as e:
logger.warning(f"{self}: Failed to close previous prewarmed context: {e}")
self._prewarmed_context_id = None
try:
await self._send_context(self._turn_context_id)
self._prewarmed_context_id = self._turn_context_id
except Exception as e:
logger.warning(f"{self}: Failed to pre-open context: {e}")

def _calculate_word_times(self, timestamp_info: Dict[str, Any]) -> List[Tuple[str, float]]:
"""Calculate word timestamps from Inworld WebSocket API response.

Expand Down Expand Up @@ -887,6 +916,7 @@ async def _disconnect_websocket(self):
finally:
await self.remove_active_audio_context()
self._websocket = None
self._prewarmed_context_id = None
self._cumulative_time = 0.0
self._generation_end_time = 0.0
await self._call_event_handler("on_disconnected")
Expand Down Expand Up @@ -1001,9 +1031,16 @@ async def _keepalive_task_handler(self):
async def _send_context(self, context_id: str):
"""Send a context to the Inworld WebSocket TTS service.

Skips the send if this context was already pre-opened on the server
(prewarmed during process_frame).

Args:
context_id: The context ID.
"""
if context_id == self._prewarmed_context_id:
self._prewarmed_context_id = None
return

audio_config = {
"audioEncoding": self._audio_encoding,
"sampleRateHertz": self._audio_sample_rate,
Expand Down
Loading