-
Notifications
You must be signed in to change notification settings - Fork 13.3k
Expand file tree
/
Copy pathvoiceModeController.ts
More file actions
108 lines (95 loc) · 3.03 KB
/
voiceModeController.ts
File metadata and controls
108 lines (95 loc) · 3.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { debugLogger } from '../utils/debugLogger.js';
import type {
AudioInputProvider,
SpeechToTextAdapter,
TextToSpeechAdapter,
VoiceSessionConfig,
} from './types.js';
import { VoiceState } from './types.js';
/**
 * Orchestrates the voice mode lifecycle.
 *
 * Wires together an AudioInputProvider, SpeechToTextAdapter, and
 * TextToSpeechAdapter into a coherent listen→transcribe→respond→speak loop.
 *
 * This is a skeleton — real audio backends will be injected later.
 * The controller is intentionally thin so it can be tested without hardware.
 *
 * State machine: Idle → (start) → Listening → Processing → Speaking → Listening …
 * An error in the pipeline parks the controller in Error, where subsequent
 * chunks are dropped until stop() resets it to Idle.
 */
export class VoiceModeController {
  // Lifecycle state; mutated only by start(), stop(), and the chunk callback.
  private state: VoiceState = VoiceState.Idle;
  private readonly audioInput: AudioInputProvider;
  private readonly stt: SpeechToTextAdapter;
  private readonly tts: TextToSpeechAdapter;
  private readonly config: VoiceSessionConfig;

  /**
   * @param audioInput Source of raw audio chunks (microphone or test double).
   * @param stt Adapter that turns an audio chunk into a transcript string.
   * @param tts Adapter that speaks a transcript back to the user.
   * @param config Optional session tuning (locale, sampleRate); defaults to {}.
   */
  constructor(
    audioInput: AudioInputProvider,
    stt: SpeechToTextAdapter,
    tts: TextToSpeechAdapter,
    config: VoiceSessionConfig = {},
  ) {
    this.audioInput = audioInput;
    this.stt = stt;
    this.tts = tts;
    this.config = config;
  }

  /** Current lifecycle state. */
  getState(): VoiceState {
    return this.state;
  }

  /**
   * Start the voice session.
   * Opens the audio input and begins the listen loop.
   *
   * No-op (with a warning) unless the controller is currently Idle, so a
   * double start() cannot open the input twice.
   */
  async start(): Promise<void> {
    if (this.state !== VoiceState.Idle) {
      debugLogger.warn(
        `VoiceModeController.start() called in state "${this.state}", ignoring.`,
      );
      return;
    }
    debugLogger.log(
      `[voice] Starting voice mode (locale=${this.config.locale ?? 'default'}, ` +
        `sampleRate=${String(this.config.sampleRate ?? 16000)})`,
    );
    this.state = VoiceState.Listening;
    await this.audioInput.start(async (chunk) => {
      // Drop chunks that arrive while a previous chunk is still in flight
      // (Processing/Speaking) or after an error/stop.
      if (this.state !== VoiceState.Listening) return;
      try {
        this.state = VoiceState.Processing;
        const transcript = await this.stt.transcribe(chunk);
        if (transcript.trim().length === 0) {
          // Silence / empty recognition: re-arm without speaking.
          this.state = VoiceState.Listening;
          return;
        }
        debugLogger.log(`[voice] Transcript: "${transcript}"`);
        // In the future this is where the transcript feeds into the Gemini
        // conversation loop (GeminiClient.sendMessageStream). For now, we
        // echo it back through TTS as a proof-of-lifecycle.
        this.state = VoiceState.Speaking;
        await this.tts.speak(transcript);
      } catch (err) {
        debugLogger.error('[voice] Error in voice pipeline:', err);
        this.state = VoiceState.Error;
      } finally {
        // Re-arm for the next chunk unless an error parked us in Error
        // (or stop() already moved us to Idle).
        if (
          this.state === VoiceState.Speaking ||
          this.state === VoiceState.Processing
        ) {
          this.state = VoiceState.Listening;
        }
      }
    });
  }

  /**
   * Stop the voice session and release resources.
   *
   * The audio input is closed and the state reset to Idle even if cancelling
   * TTS rejects; previously a tts.cancel() failure leaked the audio input and
   * left the controller stuck in a non-Idle state, making a later start() a
   * silent no-op. A cancel() rejection still propagates to the caller after
   * cleanup completes.
   */
  async stop(): Promise<void> {
    debugLogger.log('[voice] Stopping voice mode.');
    try {
      await this.tts.cancel();
    } finally {
      await this.audioInput.stop();
      this.state = VoiceState.Idle;
    }
  }
}