@@ -235,6 +235,14 @@ export class AgentActivity implements RecognitionHooks {
235235 } catch ( error ) {
236236 this . logger . error ( error , 'failed to update the tools' ) ;
237237 }
238+
239+ if ( ! this . llm . capabilities . audioOutput && ! this . tts && this . agentSession . output . audio ) {
240+ this . logger . error (
241+ 'audio output is enabled but RealtimeModel has no audio modality ' +
242+ 'and no TTS is set. Either enable audio modality in the RealtimeModel ' +
243+ 'or set a TTS model.' ,
244+ ) ;
245+ }
238246 } else if ( this . llm instanceof LLM ) {
239247 try {
240248 updateInstructions ( {
@@ -1633,7 +1641,7 @@ export class AgentActivity implements RecognitionHooks {
16331641
16341642 const readMessages = async (
16351643 abortController : AbortController ,
1636- outputs : Array < [ string , _TextOut | null , _AudioOut | null ] > ,
1644+ outputs : Array < [ string , _TextOut | null , _AudioOut | null , ( 'text' | 'audio' ) [ ] | undefined ] > ,
16371645 ) => {
16381646 replyAbortController . signal . addEventListener ( 'abort' , ( ) => abortController . abort ( ) , {
16391647 once : true ,
@@ -1648,7 +1656,25 @@ export class AgentActivity implements RecognitionHooks {
16481656 ) ;
16491657 break ;
16501658 }
1651- const trNodeResult = await this . agent . transcriptionNode ( msg . textStream , modelSettings ) ;
1659+
1660+ const msgModalities = msg . modalities ? await msg . modalities : undefined ;
1661+ let ttsTextInput : ReadableStream < string > | null = null ;
1662+ let trTextInput : ReadableStream < string > ;
1663+
1664+ if ( msgModalities && ! msgModalities . includes ( 'audio' ) && this . tts ) {
1665+ if ( this . llm instanceof RealtimeModel && this . llm . capabilities . audioOutput ) {
1666+ this . logger . warn (
1667+ 'text response received from realtime API, falling back to use a TTS model.' ,
1668+ ) ;
1669+ }
1670+ const [ _ttsTextInput , _trTextInput ] = msg . textStream . tee ( ) ;
1671+ ttsTextInput = _ttsTextInput ;
1672+ trTextInput = _trTextInput ;
1673+ } else {
1674+ trTextInput = msg . textStream ;
1675+ }
1676+
1677+ const trNodeResult = await this . agent . transcriptionNode ( trTextInput , modelSettings ) ;
16521678 let textOut : _TextOut | null = null ;
16531679 if ( trNodeResult ) {
16541680 const [ textForwardTask , _textOut ] = performTextForwarding (
@@ -1659,30 +1685,51 @@ export class AgentActivity implements RecognitionHooks {
16591685 forwardTasks . push ( textForwardTask ) ;
16601686 textOut = _textOut ;
16611687 }
1688+
16621689 let audioOut : _AudioOut | null = null ;
16631690 if ( audioOutput ) {
1664- const realtimeAudio = await this . agent . realtimeAudioOutputNode (
1665- msg . audioStream ,
1666- modelSettings ,
1667- ) ;
1668- if ( realtimeAudio ) {
1691+ let realtimeAudioResult : ReadableStream < AudioFrame > | null = null ;
1692+
1693+ if ( ttsTextInput ) {
1694+ const [ ttsTask , ttsStream ] = performTTSInference (
1695+ ( ...args ) => this . agent . ttsNode ( ...args ) ,
1696+ ttsTextInput ,
1697+ modelSettings ,
1698+ abortController ,
1699+ ) ;
1700+ tasks . push ( ttsTask ) ;
1701+ realtimeAudioResult = ttsStream ;
1702+ } else if ( msgModalities && msgModalities . includes ( 'audio' ) ) {
1703+ realtimeAudioResult = await this . agent . realtimeAudioOutputNode (
1704+ msg . audioStream ,
1705+ modelSettings ,
1706+ ) ;
1707+ } else if ( this . llm instanceof RealtimeModel && this . llm . capabilities . audioOutput ) {
1708+ this . logger . error (
1709+ 'Text message received from Realtime API with audio modality. ' +
1710+ 'This usually happens when text chat context is synced to the API. ' +
1711+ 'Try to add a TTS model as fallback or use text modality with TTS instead.' ,
1712+ ) ;
1713+ } else {
1714+ this . logger . warn (
1715+ 'audio output is enabled but neither tts nor realtime audio is available' ,
1716+ ) ;
1717+ }
1718+
1719+ if ( realtimeAudioResult ) {
16691720 const [ forwardTask , _audioOut ] = performAudioForwarding (
1670- realtimeAudio ,
1721+ realtimeAudioResult ,
16711722 audioOutput ,
16721723 abortController ,
16731724 ) ;
16741725 forwardTasks . push ( forwardTask ) ;
16751726 audioOut = _audioOut ;
16761727 audioOut . firstFrameFut . await . finally ( onFirstFrame ) ;
1677- } else {
1678- this . logger . warn (
1679- 'audio output is enabled but neither tts nor realtime audio is available' ,
1680- ) ;
16811728 }
16821729 } else if ( textOut ) {
16831730 textOut . firstTextFut . await . finally ( onFirstFrame ) ;
16841731 }
1685- outputs . push ( [ msg . messageId , textOut , audioOut ] ) ;
1732+ outputs . push ( [ msg . messageId , textOut , audioOut , msgModalities ] ) ;
16861733 }
16871734 await waitFor ( forwardTasks ) ;
16881735 } catch ( error ) {
@@ -1692,7 +1739,9 @@ export class AgentActivity implements RecognitionHooks {
16921739 }
16931740 } ;
16941741
1695- const messageOutputs : Array < [ string , _TextOut | null , _AudioOut | null ] > = [ ] ;
1742+ const messageOutputs : Array <
1743+ [ string , _TextOut | null , _AudioOut | null , ( 'text' | 'audio' ) [ ] | undefined ]
1744+ > = [ ] ;
16961745 const tasks = [
16971746 Task . from (
16981747 ( controller ) => readMessages ( controller , messageOutputs ) ,
@@ -1771,7 +1820,7 @@ export class AgentActivity implements RecognitionHooks {
17711820
17721821 if ( messageOutputs . length > 0 ) {
17731822 // there should be only one message
1774- const [ msgId , textOut , audioOut ] = messageOutputs [ 0 ] ! ;
1823+ const [ msgId , textOut , audioOut , msgModalities ] = messageOutputs [ 0 ] ! ;
17751824 let forwardedText = textOut ?. text || '' ;
17761825
17771826 if ( audioOutput ) {
@@ -1796,6 +1845,8 @@ export class AgentActivity implements RecognitionHooks {
17961845 this . realtimeSession . truncate ( {
17971846 messageId : msgId ,
17981847 audioEndMs : Math . floor ( playbackPosition ) ,
1848+ modalities : msgModalities ,
1849+ audioTranscript : forwardedText ,
17991850 } ) ;
18001851 }
18011852
@@ -1826,7 +1877,7 @@ export class AgentActivity implements RecognitionHooks {
18261877
18271878 if ( messageOutputs . length > 0 ) {
18281879 // there should be only one message
1829- const [ msgId , textOut , _ ] = messageOutputs [ 0 ] ! ;
1880+ const [ msgId , textOut , _ , __ ] = messageOutputs [ 0 ] ! ;
18301881 const message = ChatMessage . create ( {
18311882 role : 'assistant' ,
18321883 content : textOut ?. text || '' ,
0 commit comments