-
Notifications
You must be signed in to change notification settings - Fork 13.3k
Expand file tree
/
Copy pathvoiceModeController.ts
More file actions
108 lines (95 loc) · 3.03 KB
/
voiceModeController.ts
File metadata and controls
108 lines (95 loc) · 3.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { debugLogger } from '../utils/debugLogger.js';
import type {
AudioInputProvider,
SpeechToTextAdapter,
TextToSpeechAdapter,
VoiceSessionConfig,
} from './types.js';
import { VoiceState } from './types.js';
/**
 * Orchestrates the voice mode lifecycle.
 *
 * Wires together an AudioInputProvider, SpeechToTextAdapter, and
 * TextToSpeechAdapter into a coherent listen→transcribe→respond→speak loop.
 *
 * This is a skeleton — real audio backends will be injected later.
 * The controller is intentionally thin so it can be tested without hardware.
 *
 * State machine: Idle → (start) → Listening → Processing → Speaking → Listening …
 * An error in the pipeline parks the controller in Error, where subsequent
 * chunks are dropped until stop() resets it to Idle.
 */
export class VoiceModeController {
  // Lifecycle state; mutated only by start(), stop(), and the chunk callback.
  private state: VoiceState = VoiceState.Idle;
  private readonly audioInput: AudioInputProvider;
  private readonly stt: SpeechToTextAdapter;
  private readonly tts: TextToSpeechAdapter;
  private readonly config: VoiceSessionConfig;

  /**
   * @param audioInput Source of raw audio chunks (microphone or test double).
   * @param stt Adapter that turns an audio chunk into a transcript string.
   * @param tts Adapter that speaks a transcript back to the user.
   * @param config Optional session tuning (locale, sampleRate); defaults to {}.
   */
  constructor(
    audioInput: AudioInputProvider,
    stt: SpeechToTextAdapter,
    tts: TextToSpeechAdapter,
    config: VoiceSessionConfig = {},
  ) {
    this.audioInput = audioInput;
    this.stt = stt;
    this.tts = tts;
    this.config = config;
  }

  /** Current lifecycle state. */
  getState(): VoiceState {
    return this.state;
  }

  /**
   * Start the voice session.
   * Opens the audio input and begins the listen loop.
   *
   * No-op (with a warning) unless the controller is currently Idle, so a
   * double start() cannot open the input twice.
   */
  async start(): Promise<void> {
    if (this.state !== VoiceState.Idle) {
      debugLogger.warn(
        `VoiceModeController.start() called in state "${this.state}", ignoring.`,
      );
      return;
    }
    debugLogger.log(
      `[voice] Starting voice mode (locale=${this.config.locale ?? 'default'}, ` +
        `sampleRate=${String(this.config.sampleRate ?? 16000)})`,
    );
    this.state = VoiceState.Listening;
    await this.audioInput.start(async (chunk) => {
      // Drop chunks that arrive while a previous chunk is still in flight
      // (Processing/Speaking) or after an error/stop.
      if (this.state !== VoiceState.Listening) return;
      try {
        this.state = VoiceState.Processing;
        const transcript = await this.stt.transcribe(chunk);
        if (transcript.trim().length === 0) {
          // Silence / empty recognition: re-arm without speaking.
          this.state = VoiceState.Listening;
          return;
        }
        debugLogger.log(`[voice] Transcript: "${transcript}"`);
        // In the future this is where the transcript feeds into the Gemini
        // conversation loop (GeminiClient.sendMessageStream). For now, we
        // echo it back through TTS as a proof-of-lifecycle.
        this.state = VoiceState.Speaking;
        await this.tts.speak(transcript);
      } catch (err) {
        debugLogger.error('[voice] Error in voice pipeline:', err);
        this.state = VoiceState.Error;
      } finally {
        // Re-arm for the next chunk unless an error parked us in Error
        // (or stop() already moved us to Idle).
        if (
          this.state === VoiceState.Speaking ||
          this.state === VoiceState.Processing
        ) {
          this.state = VoiceState.Listening;
        }
      }
    });
  }

  /**
   * Stop the voice session and release resources.
   *
   * The audio input is closed and the state reset to Idle even if cancelling
   * TTS rejects; previously a tts.cancel() failure leaked the audio input and
   * left the controller stuck in a non-Idle state, making a later start() a
   * silent no-op. A cancel() rejection still propagates to the caller after
   * cleanup completes.
   */
  async stop(): Promise<void> {
    debugLogger.log('[voice] Stopping voice mode.');
    try {
      await this.tts.cancel();
    } finally {
      await this.audioInput.stop();
      this.state = VoiceState.Idle;
    }
  }
}