@@ -467,9 +467,10 @@ var audioAnnotationRe = regexp.MustCompile(`\[(voice|audio)(?::[^\]]*)?\]`)
467467
468468// transcribeAudioInMessage resolves audio media refs, transcribes them, and
469469// replaces audio annotations in msg.Content with the transcribed text.
470- func (al * AgentLoop ) transcribeAudioInMessage (ctx context.Context , msg bus.InboundMessage ) bus.InboundMessage {
470+ // Returns the (possibly modified) message and true if audio was transcribed.
471+ func (al * AgentLoop ) transcribeAudioInMessage (ctx context.Context , msg bus.InboundMessage ) (bus.InboundMessage , bool ) {
471472 if al .transcriber == nil || al .mediaStore == nil || len (msg .Media ) == 0 {
472- return msg
473+ return msg , false
473474 }
474475
475476 // Transcribe each audio media ref in order.
@@ -493,9 +494,11 @@ func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.Inbou
493494 }
494495
495496 if len (transcriptions ) == 0 {
496- return msg
497+ return msg , false
497498 }
498499
500+ al .sendTranscriptionFeedback (ctx , msg .Channel , msg .ChatID , msg .MessageID , transcriptions )
501+
499502 // Replace audio annotations sequentially with transcriptions.
500503 idx := 0
501504 newContent := audioAnnotationRe .ReplaceAllStringFunc (msg .Content , func (match string ) string {
@@ -513,7 +516,48 @@ func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.Inbou
513516 }
514517
515518 msg .Content = newContent
516- return msg
519+ return msg , true
520+ }
521+
522+ // sendTranscriptionFeedback sends feedback to the user with the result of
523+ // audio transcription if the option is enabled. It uses Manager.SendMessage
524+ // which executes synchronously (rate limiting, splitting, retry) so that
525+ // ordering with the subsequent placeholder is guaranteed.
526+ func (al * AgentLoop ) sendTranscriptionFeedback (
527+ ctx context.Context ,
528+ channel , chatID , messageID string ,
529+ validTexts []string ,
530+ ) {
531+ if ! al .cfg .Voice .EchoTranscription {
532+ return
533+ }
534+ if al .channelManager == nil {
535+ return
536+ }
537+
538+ var nonEmpty []string
539+ for _ , t := range validTexts {
540+ if t != "" {
541+ nonEmpty = append (nonEmpty , t )
542+ }
543+ }
544+
545+ var feedbackMsg string
546+ if len (nonEmpty ) > 0 {
547+ feedbackMsg = "Transcript: " + strings .Join (nonEmpty , "\n " )
548+ } else {
549+ feedbackMsg = "No voice detected in the audio"
550+ }
551+
552+ err := al .channelManager .SendMessage (ctx , bus.OutboundMessage {
553+ Channel : channel ,
554+ ChatID : chatID ,
555+ Content : feedbackMsg ,
556+ ReplyToMessageID : messageID ,
557+ })
558+ if err != nil {
559+ logger .WarnCF ("voice" , "Failed to send transcription feedback" , map [string ]any {"error" : err .Error ()})
560+ }
517561}
518562
519563// inferMediaType determines the media type ("image", "audio", "video", "file")
@@ -627,7 +671,14 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage)
627671 },
628672 )
629673
630- msg = al .transcribeAudioInMessage (ctx , msg )
674+ var hadAudio bool
675+ msg , hadAudio = al .transcribeAudioInMessage (ctx , msg )
676+
677+ // For audio messages the placeholder was deferred by the channel.
678+ // Now that transcription (and optional feedback) is done, send it.
679+ if hadAudio && al .channelManager != nil {
680+ al .channelManager .SendPlaceholder (ctx , msg .Channel , msg .ChatID )
681+ }
631682
632683 // Route system messages to processSystemMessage
633684 if msg .Channel == "system" {
0 commit comments