Skip to content

Commit a3a0949

Browse files
authored
Merge pull request sipeed#1214 from afjcjsbx/feat/echo-voice-audio-transcription
feat(channel): echo voice audio transcription feedback
2 parents 4ef8e71 + 23cf270 commit a3a0949

11 files changed

Lines changed: 474 additions & 19 deletions

File tree

config/config.example.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -477,6 +477,9 @@
477477
"enabled": false,
478478
"monitor_usb": true
479479
},
480+
"voice": {
481+
"echo_transcription": false
482+
},
480483
"gateway": {
481484
"host": "127.0.0.1",
482485
"port": 18790

pkg/agent/loop.go

Lines changed: 56 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -467,9 +467,10 @@ var audioAnnotationRe = regexp.MustCompile(`\[(voice|audio)(?::[^\]]*)?\]`)
467467

468468
// transcribeAudioInMessage resolves audio media refs, transcribes them, and
469469
// replaces audio annotations in msg.Content with the transcribed text.
470-
func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.InboundMessage) bus.InboundMessage {
470+
// Returns the (possibly modified) message and true if audio was transcribed.
471+
func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.InboundMessage) (bus.InboundMessage, bool) {
471472
if al.transcriber == nil || al.mediaStore == nil || len(msg.Media) == 0 {
472-
return msg
473+
return msg, false
473474
}
474475

475476
// Transcribe each audio media ref in order.
@@ -493,9 +494,11 @@ func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.Inbou
493494
}
494495

495496
if len(transcriptions) == 0 {
496-
return msg
497+
return msg, false
497498
}
498499

500+
al.sendTranscriptionFeedback(ctx, msg.Channel, msg.ChatID, msg.MessageID, transcriptions)
501+
499502
// Replace audio annotations sequentially with transcriptions.
500503
idx := 0
501504
newContent := audioAnnotationRe.ReplaceAllStringFunc(msg.Content, func(match string) string {
@@ -513,7 +516,48 @@ func (al *AgentLoop) transcribeAudioInMessage(ctx context.Context, msg bus.Inbou
513516
}
514517

515518
msg.Content = newContent
516-
return msg
519+
return msg, true
520+
}
521+
522+
// sendTranscriptionFeedback sends feedback to the user with the result of
523+
// audio transcription if the option is enabled. It uses Manager.SendMessage
524+
// which executes synchronously (rate limiting, splitting, retry) so that
525+
// ordering with the subsequent placeholder is guaranteed.
526+
func (al *AgentLoop) sendTranscriptionFeedback(
527+
ctx context.Context,
528+
channel, chatID, messageID string,
529+
validTexts []string,
530+
) {
531+
if !al.cfg.Voice.EchoTranscription {
532+
return
533+
}
534+
if al.channelManager == nil {
535+
return
536+
}
537+
538+
var nonEmpty []string
539+
for _, t := range validTexts {
540+
if t != "" {
541+
nonEmpty = append(nonEmpty, t)
542+
}
543+
}
544+
545+
var feedbackMsg string
546+
if len(nonEmpty) > 0 {
547+
feedbackMsg = "Transcript: " + strings.Join(nonEmpty, "\n")
548+
} else {
549+
feedbackMsg = "No voice detected in the audio"
550+
}
551+
552+
err := al.channelManager.SendMessage(ctx, bus.OutboundMessage{
553+
Channel: channel,
554+
ChatID: chatID,
555+
Content: feedbackMsg,
556+
ReplyToMessageID: messageID,
557+
})
558+
if err != nil {
559+
logger.WarnCF("voice", "Failed to send transcription feedback", map[string]any{"error": err.Error()})
560+
}
517561
}
518562

519563
// inferMediaType determines the media type ("image", "audio", "video", "file")
@@ -627,7 +671,14 @@ func (al *AgentLoop) processMessage(ctx context.Context, msg bus.InboundMessage)
627671
},
628672
)
629673

630-
msg = al.transcribeAudioInMessage(ctx, msg)
674+
var hadAudio bool
675+
msg, hadAudio = al.transcribeAudioInMessage(ctx, msg)
676+
677+
// For audio messages the placeholder was deferred by the channel.
678+
// Now that transcription (and optional feedback) is done, send it.
679+
if hadAudio && al.channelManager != nil {
680+
al.channelManager.SendPlaceholder(ctx, msg.Channel, msg.ChatID)
681+
}
631682

632683
// Route system messages to processSystemMessage
633684
if msg.Channel == "system" {

pkg/bus/types.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,10 @@ type InboundMessage struct {
3030
}
3131

3232
type OutboundMessage struct {
33-
Channel string `json:"channel"`
34-
ChatID string `json:"chat_id"`
35-
Content string `json:"content"`
33+
Channel string `json:"channel"`
34+
ChatID string `json:"chat_id"`
35+
Content string `json:"content"`
36+
ReplyToMessageID string `json:"reply_to_message_id,omitempty"`
3637
}
3738

3839
// MediaPart describes a single media attachment to send.

pkg/channels/base.go

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"crypto/rand"
66
"encoding/binary"
77
"encoding/hex"
8+
"regexp"
89
"strconv"
910
"strings"
1011
"sync/atomic"
@@ -32,6 +33,9 @@ func init() {
3233
uniqueIDPrefix = hex.EncodeToString(b[:])
3334
}
3435

36+
// audioAnnotationRe matches audio/voice annotations injected by channels (e.g. [voice], [audio: file.ogg]).
37+
var audioAnnotationRe = regexp.MustCompile(`\[(voice|audio)(?::[^\]]*)?\]`)
38+
3539
// uniqueID generates a process-unique ID using a random prefix and an atomic counter.
3640
// This ID is intended for internal correlation (e.g. media scope keys) and is NOT
3741
// cryptographically secure — it must not be used in contexts where unpredictability matters.
@@ -284,10 +288,15 @@ func (c *BaseChannel) HandleMessage(
284288
c.placeholderRecorder.RecordReactionUndo(c.name, chatID, undo)
285289
}
286290
}
287-
// Placeholder — independent pipeline
288-
if pc, ok := c.owner.(PlaceholderCapable); ok {
289-
if phID, err := pc.SendPlaceholder(ctx, chatID); err == nil && phID != "" {
290-
c.placeholderRecorder.RecordPlaceholder(c.name, chatID, phID)
291+
// Placeholder — independent pipeline.
292+
// Skip when the message contains audio: the agent will send the
293+
// placeholder after transcription completes, so the user sees
294+
// "Thinking…" only once the voice has been processed.
295+
if !audioAnnotationRe.MatchString(content) {
296+
if pc, ok := c.owner.(PlaceholderCapable); ok {
297+
if phID, err := pc.SendPlaceholder(ctx, chatID); err == nil && phID != "" {
298+
c.placeholderRecorder.RecordPlaceholder(c.name, chatID, phID)
299+
}
291300
}
292301
}
293302
}

pkg/channels/discord/discord.go

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ func (c *DiscordChannel) Send(ctx context.Context, msg bus.OutboundMessage) erro
134134
return nil
135135
}
136136

137-
return c.sendChunk(ctx, channelID, msg.Content)
137+
return c.sendChunk(ctx, channelID, msg.Content, msg.ReplyToMessageID)
138138
}
139139

140140
// SendMedia implements the channels.MediaSender interface.
@@ -259,14 +259,29 @@ func (c *DiscordChannel) SendPlaceholder(ctx context.Context, chatID string) (st
259259
return msg.ID, nil
260260
}
261261

262-
func (c *DiscordChannel) sendChunk(ctx context.Context, channelID, content string) error {
262+
func (c *DiscordChannel) sendChunk(ctx context.Context, channelID, content, replyToID string) error {
263263
// Use the passed ctx for timeout control
264264
sendCtx, cancel := context.WithTimeout(ctx, sendTimeout)
265265
defer cancel()
266266

267267
done := make(chan error, 1)
268268
go func() {
269-
_, err := c.session.ChannelMessageSend(channelID, content)
269+
var err error
270+
271+
// If we have an ID, we send the message as "Reply"
272+
if replyToID != "" {
273+
_, err = c.session.ChannelMessageSendComplex(channelID, &discordgo.MessageSend{
274+
Content: content,
275+
Reference: &discordgo.MessageReference{
276+
MessageID: replyToID,
277+
ChannelID: channelID,
278+
},
279+
})
280+
} else {
281+
// Otherwise, we send a normal message
282+
_, err = c.session.ChannelMessageSend(channelID, content)
283+
}
284+
270285
done <- err
271286
}()
272287

pkg/channels/manager.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,27 @@ func (m *Manager) RecordPlaceholder(channel, chatID, placeholderID string) {
102102
m.placeholders.Store(key, placeholderEntry{id: placeholderID, createdAt: time.Now()})
103103
}
104104

105+
// SendPlaceholder sends a "Thinking…" placeholder for the given channel/chatID
106+
// and records it for later editing. Returns true if a placeholder was sent.
107+
func (m *Manager) SendPlaceholder(ctx context.Context, channel, chatID string) bool {
108+
m.mu.RLock()
109+
ch, ok := m.channels[channel]
110+
m.mu.RUnlock()
111+
if !ok {
112+
return false
113+
}
114+
pc, ok := ch.(PlaceholderCapable)
115+
if !ok {
116+
return false
117+
}
118+
phID, err := pc.SendPlaceholder(ctx, chatID)
119+
if err != nil || phID == "" {
120+
return false
121+
}
122+
m.RecordPlaceholder(channel, chatID, phID)
123+
return true
124+
}
125+
105126
// RecordTypingStop registers a typing stop function for later invocation.
106127
// Implements PlaceholderRecorder.
107128
func (m *Manager) RecordTypingStop(channel, chatID string, stop func()) {
@@ -813,6 +834,39 @@ func (m *Manager) UnregisterChannel(name string) {
813834
delete(m.channels, name)
814835
}
815836

837+
// SendMessage sends an outbound message synchronously through the channel
838+
// worker's rate limiter and retry logic. It blocks until the message is
839+
// delivered (or all retries are exhausted), which preserves ordering when
840+
// a subsequent operation depends on the message having been sent.
841+
func (m *Manager) SendMessage(ctx context.Context, msg bus.OutboundMessage) error {
842+
m.mu.RLock()
843+
_, exists := m.channels[msg.Channel]
844+
w, wExists := m.workers[msg.Channel]
845+
m.mu.RUnlock()
846+
847+
if !exists {
848+
return fmt.Errorf("channel %s not found", msg.Channel)
849+
}
850+
if !wExists || w == nil {
851+
return fmt.Errorf("channel %s has no active worker", msg.Channel)
852+
}
853+
854+
maxLen := 0
855+
if mlp, ok := w.ch.(MessageLengthProvider); ok {
856+
maxLen = mlp.MaxMessageLength()
857+
}
858+
if maxLen > 0 && len([]rune(msg.Content)) > maxLen {
859+
for _, chunk := range SplitMessage(msg.Content, maxLen) {
860+
chunkMsg := msg
861+
chunkMsg.Content = chunk
862+
m.sendWithRetry(ctx, msg.Channel, w, chunkMsg)
863+
}
864+
} else {
865+
m.sendWithRetry(ctx, msg.Channel, w, msg)
866+
}
867+
return nil
868+
}
869+
816870
func (m *Manager) SendToChannel(ctx context.Context, channelName, chatID, content string) error {
817871
m.mu.RLock()
818872
_, exists := m.channels[channelName]

0 commit comments

Comments
 (0)