[inworld] prewarm context on llm response start

ianbbqzy · ianbbqzy · commit fa2715119bf7 · 2026-03-12T14:42:56.000-07:00
diff --git a/src/pipecat/services/inworld/tts.py b/src/pipecat/services/inworld/tts.py
@@ -56,6 +56,7 @@
     ErrorFrame,
     Frame,
     InterruptionFrame,
+    LLMFullResponseStartFrame,
     StartFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
@@ -653,6 +654,11 @@ def __init__(
         # Track the end time of the last word in the current generation
         self._generation_end_time = 0.0
 
+        # Context ID that was pre-opened on the server during process_frame
+        # (LLMFullResponseStartFrame) to avoid context creation latency when
+        # the first text arrives.
+        self._prewarmed_context_id: Optional[str] = None
+
         # Init-only config (not runtime-updatable).
         self._audio_encoding = encoding
         self._audio_sample_rate = 0  # Set in start()
@@ -726,6 +732,29 @@ async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirect
             if isinstance(frame, TTSStoppedFrame):
                 await self.add_word_timestamps([("Reset", 0)])
 
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process incoming frames and pre-open context on LLM response start.
+
+        After the base class handles the frame, an LLMFullResponseStartFrame eagerly sends the
+        context configuration so the context is ready when text starts flowing. The base class
+        assigns ``_turn_context_id`` before this runs, and that ID is reused for all ``run_tts``
+        calls within the turn. A prewarmed context that was never consumed is closed first.
+        """
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, LLMFullResponseStartFrame):
+            if self._prewarmed_context_id:  # previous prewarm was never consumed
+                try:
+                    await self._send_close_context(self._prewarmed_context_id)
+                except Exception as e:
+                    logger.warning(f"{self}: Failed to close previous prewarmed context: {e}")
+                self._prewarmed_context_id = None
+            try:
+                await self._send_context(self._turn_context_id)
+                self._prewarmed_context_id = self._turn_context_id
+            except Exception as e:  # best-effort; a later _send_context will not be skipped
+                logger.warning(f"{self}: Failed to pre-open context: {e}")
+
     def _calculate_word_times(self, timestamp_info: Dict[str, Any]) -> List[Tuple[str, float]]:
         """Calculate word timestamps from Inworld WebSocket API response.
 
@@ -887,6 +916,7 @@ async def _disconnect_websocket(self):
         finally:
             await self.remove_active_audio_context()
             self._websocket = None
+            self._prewarmed_context_id = None
             self._cumulative_time = 0.0
             self._generation_end_time = 0.0
             await self._call_event_handler("on_disconnected")
@@ -1001,9 +1031,16 @@ async def _keepalive_task_handler(self):
     async def _send_context(self, context_id: str):
         """Send a context to the Inworld WebSocket TTS service.
 
+        Skips the send if this context was already pre-opened on the server
+        (prewarmed during process_frame).
+
         Args:
             context_id: The context ID.
         """
+        if context_id == self._prewarmed_context_id:
+            self._prewarmed_context_id = None
+            return
+
         audio_config = {
             "audioEncoding": self._audio_encoding,
             "sampleRateHertz": self._audio_sample_rate,