diff --git a/.claude/settings.local.json b/.claude/settings.local.json
new file mode 100644
index 00000000000..17ef799075d
--- /dev/null
+++ b/.claude/settings.local.json
@@ -0,0 +1,31 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(done)",
+      "Bash(xargs grep -l \"mode\")",
+      "Bash(xargs grep -l \"hook\\\\|middleware\\\\|plugin\")",
+      "Bash(xargs grep -l \"mode\\\\|Mode\")",
+      "Bash(xargs grep -l \"SecurityModel\\\\|sandbox\")",
+      "Bash(xargs -I {} bash -c 'echo \"\"=== {} ===\"\" && head -50 \"\"{}\"\"')",
+      "Bash(node --version)",
+      "Bash(npm install)",
+      "Bash(npm run build)",
+      "Bash(npm start)",
+      "Bash(node packages/cli/bundle/gemini.js --version)",
+      "Bash(node bundle/gemini.js --version)",
+      "Bash(node bundle/gemini.js --help)",
+      "Bash(npm run build --workspace=@google/gemini-cli-core)",
+      "Bash(npm run build --workspace=@google/gemini-cli)",
+      "Bash(node bundle/gemini.js --voice)",
+      "Bash(npm run bundle)",
+      "Bash(npm start -- --voice)",
+      "Bash(node bundle/gemini.js)",
+      "Bash(echo \"EXIT CODE: $?\")",
+      "Bash(npm run test --workspace=@google/gemini-cli-core)",
+      "Bash(npm run test --workspace=@google/gemini-cli)",
+      "Bash(npx vitest run packages/cli/src/gemini.test.tsx)",
+      "Bash(git checkout -- 'packages/cli/src/ui/components/__snapshots__/ConfigInitDisplay.test.tsx.snap')",
+      "Bash(git checkout -- package-lock.json)"
+    ]
+  }
+}
diff --git a/packages/cli/src/config/config.ts b/packages/cli/src/config/config.ts
index a8c85975e93..c64da568b84 100755
--- a/packages/cli/src/config/config.ts
+++ b/packages/cli/src/config/config.ts
@@ -78,6 +78,7 @@ export interface CliArgs {
   allowedTools: string[] | undefined;
   acp?: boolean;
   experimentalAcp?: boolean;
+  experimentalVoice: boolean | undefined;
   extensions: string[] | undefined;
   listExtensions: boolean | undefined;
   resume: string | typeof RESUME_LATEST | undefined;
@@ -182,6 +183,11 @@ export async function parseArguments(
           description:
             'Starts the agent in ACP mode (deprecated, use --acp instead)',
         })
+        .option('experimental-voice', {
+          type: 'boolean',
+          description: '[Experimental] Start in hands-free voice mode',
+          hidden: true,
+        })
         .option('allowed-mcp-server-names', {
           type: 'array',
           string: true,
@@ -758,7 +764,7 @@ export async function loadCliConfig(
     bugCommand: settings.advanced?.bugCommand,
     model: resolvedModel,
     maxSessionTurns: settings.model?.maxSessionTurns,
-
+    experimentalVoice: argv.experimentalVoice || false,
     listExtensions: argv.listExtensions || false,
     listSessions: argv.listSessions || false,
     deleteSession: argv.deleteSession,
diff --git a/packages/cli/src/gemini.test.tsx b/packages/cli/src/gemini.test.tsx
index 90c63651e78..ce175e8a3d1 100644
--- a/packages/cli/src/gemini.test.tsx
+++ b/packages/cli/src/gemini.test.tsx
@@ -484,6 +484,7 @@ describe('gemini.tsx main function kitty protocol', () => {
         allowedMcpServerNames: undefined,
         allowedTools: undefined,
         experimentalAcp: undefined,
+        experimentalVoice: undefined,
         extensions: undefined,
         listExtensions: undefined,
         includeDirectories: undefined,
diff --git a/packages/cli/src/gemini.tsx b/packages/cli/src/gemini.tsx
index 331ec0c0189..ed396ba86ed 100644
--- a/packages/cli/src/gemini.tsx
+++ b/packages/cli/src/gemini.tsx
@@ -676,6 +676,16 @@ export async function main() {
     return runAcpClient(config, settings, argv);
   }
 
+  if (config.getExperimentalVoice()) {
+    writeToStderr(
+      '[experimental] Voice mode is not yet implemented. ' +
+        'The --experimental-voice flag registers the architectural ' +
+        'skeleton only.\n',
+    );
+    await runExitCleanup();
+    process.exit(ExitCodes.SUCCESS);
+  }
+
   let input = config.getQuestion();
   const useAlternateBuffer = shouldEnterAlternateScreen(
     isAlternateBufferEnabled(config),
diff --git a/packages/cli/src/gemini_cleanup.test.tsx b/packages/cli/src/gemini_cleanup.test.tsx
index 536da027d4f..8776013c88a 100644
--- a/packages/cli/src/gemini_cleanup.test.tsx
+++ b/packages/cli/src/gemini_cleanup.test.tsx
@@ -217,6 +217,7 @@ describe('gemini.tsx main function cleanup', () => {
       getMcpClientManager: vi.fn(),
       getIdeMode: vi.fn(() => false),
       getAcpMode: vi.fn(() => true),
+      getExperimentalVoice: vi.fn(() => false),
       getScreenReader: vi.fn(() => false),
       getGeminiMdFileCount: vi.fn(() => 0),
       getProjectRoot: vi.fn(() => '/'),
diff --git a/packages/cli/src/test-utils/mockConfig.ts b/packages/cli/src/test-utils/mockConfig.ts
index c8ab45a35d5..bba6c6cec84 100644
--- a/packages/cli/src/test-utils/mockConfig.ts
+++ b/packages/cli/src/test-utils/mockConfig.ts
@@ -43,6 +43,7 @@ export const createMockConfig = (overrides: Partial<Config> = {}): Config =>
     getSessionId: vi.fn().mockReturnValue('mock-session-id'),
     getContentGeneratorConfig: vi.fn(() => ({ authType: 'google' })),
     getAcpMode: vi.fn(() => false),
+    getExperimentalVoice: vi.fn(() => false),
     isBrowserLaunchSuppressed: vi.fn(() => false),
     setRemoteAdminSettings: vi.fn(),
     isYoloModeDisabled: vi.fn(() => false),
diff --git a/packages/core/src/config/config.ts b/packages/core/src/config/config.ts
index e3201aa5213..279ff912543 100644
--- a/packages/core/src/config/config.ts
+++ b/packages/core/src/config/config.ts
@@ -516,6 +516,7 @@ export interface ConfigParameters {
   disableLoopDetection?: boolean;
   maxSessionTurns?: number;
   acpMode?: boolean;
+  experimentalVoice?: boolean;
   listSessions?: boolean;
   deleteSession?: string;
   listExtensions?: boolean;
@@ -717,4 +718,5 @@ export class Config implements McpContext {
   private readonly acpMode: boolean = false;
+  private readonly experimentalVoice: boolean = false;
   private readonly loadMemoryFromIncludeDirectories: boolean = false;
   private readonly includeDirectoryTree: boolean = true;
   private readonly importFormat: 'tree' | 'flat';
@@ -912,6 +914,7 @@ export class Config implements McpContext {
     };
     this.maxSessionTurns = params.maxSessionTurns ?? -1;
     this.acpMode = params.acpMode ?? false;
+    this.experimentalVoice = params.experimentalVoice ?? false;
     this.listSessions = params.listSessions ?? false;
     this.deleteSession = params.deleteSession;
     this.listExtensions = params.listExtensions ?? false;
@@ -2255,6 +2258,10 @@ export class Config implements McpContext {
     return this.acpMode;
   }
 
+  getExperimentalVoice(): boolean {
+    return this.experimentalVoice;
+  }
+
   async waitForMcpInit(): Promise<void> {
     if (this.mcpInitializationPromise) {
       await this.mcpInitializationPromise;
diff --git a/packages/core/src/voice/types.ts b/packages/core/src/voice/types.ts
new file mode 100644
index 00000000000..a55dad0b178
--- /dev/null
+++ b/packages/core/src/voice/types.ts
@@ -0,0 +1,67 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * Contracts for the Hands-Free Voice Mode pipeline.
+ *
+ * Architecture:
+ *   Mic → AudioInputProvider → SpeechToTextAdapter → [Gemini API] → TextToSpeechAdapter → Speaker
+ *
+ * Each interface is designed to be swappable — the initial implementation
+ * will use no-op stubs, and real backends (native audio, WebSocket bridges,
+ * MCP audio servers, Gemini Live API) can be plugged in later.
+ */
+
+/** PCM audio chunk emitted by an AudioInputProvider. */
+export interface AudioChunk {
+  /** Raw PCM sample data. */
+  readonly samples: Buffer;
+  /** Sample rate in Hz (e.g. 16000). */
+  readonly sampleRate: number;
+  /** Number of audio channels (1 = mono, 2 = stereo). */
+  readonly channels: number;
+}
+
+/** Captures audio from a microphone or other input device. */
+export interface AudioInputProvider {
+  /** Begin capturing audio. Implementations should emit chunks via the callback. */
+  start(onChunk: (chunk: AudioChunk) => void): Promise<void>;
+  /** Stop capturing and release resources. */
+  stop(): Promise<void>;
+  /** Whether the provider is currently capturing. */
+  isActive(): boolean;
+}
+
+/** Converts an audio chunk to text (speech-to-text). */
+export interface SpeechToTextAdapter {
+  /** Transcribe a single audio chunk. Returns the transcribed text. */
+  transcribe(chunk: AudioChunk): Promise<string>;
+}
+
+/** Converts text to audible speech (text-to-speech). */
+export interface TextToSpeechAdapter {
+  /** Synthesize text into audio and play it back. Resolves when playback ends. */
+  speak(text: string): Promise<void>;
+  /** Interrupt any in-progress playback. */
+  cancel(): Promise<void>;
+}
+
+/** Configuration for a voice session. */
+export interface VoiceSessionConfig {
+  /** Sample rate in Hz for audio capture (default: 16000). */
+  sampleRate?: number;
+  /** Locale/language code for STT/TTS (e.g. "en-US"). */
+  locale?: string;
+}
+
+/** Lifecycle states for the voice mode controller. */
+export enum VoiceState {
+  Idle = 'idle',
+  Listening = 'listening',
+  Processing = 'processing',
+  Speaking = 'speaking',
+  Error = 'error',
+}
diff --git a/packages/core/src/voice/voiceModeController.ts b/packages/core/src/voice/voiceModeController.ts
new file mode 100644
index 00000000000..848d5a4ebc2
--- /dev/null
+++ b/packages/core/src/voice/voiceModeController.ts
@@ -0,0 +1,149 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { debugLogger } from '../utils/debugLogger.js';
+import type {
+  AudioChunk,
+  AudioInputProvider,
+  SpeechToTextAdapter,
+  TextToSpeechAdapter,
+  VoiceSessionConfig,
+} from './types.js';
+import { VoiceState } from './types.js';
+
+/**
+ * Orchestrates the voice mode lifecycle.
+ *
+ * Wires together an AudioInputProvider, SpeechToTextAdapter, and
+ * TextToSpeechAdapter into a coherent listen→transcribe→respond→speak loop.
+ *
+ * This is a skeleton — real audio backends will be injected later.
+ * The controller is intentionally thin so it can be tested without hardware.
+ */
+export class VoiceModeController {
+  private state: VoiceState = VoiceState.Idle;
+  private readonly audioInput: AudioInputProvider;
+  private readonly stt: SpeechToTextAdapter;
+  private readonly tts: TextToSpeechAdapter;
+  private readonly config: VoiceSessionConfig;
+
+  constructor(
+    audioInput: AudioInputProvider,
+    stt: SpeechToTextAdapter,
+    tts: TextToSpeechAdapter,
+    config: VoiceSessionConfig = {},
+  ) {
+    this.audioInput = audioInput;
+    this.stt = stt;
+    this.tts = tts;
+    this.config = config;
+  }
+
+  /** Current lifecycle state. */
+  getState(): VoiceState {
+    return this.state;
+  }
+
+  /**
+   * Start the voice session.
+   * Opens the audio input and begins the listen loop.
+   *
+   * A call made while the controller is not idle is logged and ignored.
+   * If the audio input fails to open, the controller returns to `Idle` so
+   * that `start()` can be retried, and the error is rethrown.
+   */
+  async start(): Promise<void> {
+    if (this.state !== VoiceState.Idle) {
+      debugLogger.warn(
+        `VoiceModeController.start() called in state "${this.state}", ignoring.`,
+      );
+      return;
+    }
+
+    debugLogger.log(
+      `[voice] Starting voice mode (locale=${this.config.locale ?? 'default'}, ` +
+        `sampleRate=${String(this.config.sampleRate ?? 16000)})`,
+    );
+
+    this.state = VoiceState.Listening;
+
+    try {
+      await this.audioInput.start((chunk) => {
+        // handleChunk logs its own errors; `void` marks the fire-and-forget.
+        void this.handleChunk(chunk);
+      });
+    } catch (err) {
+      // Do not stay stuck in Listening when capture never started.
+      this.state = VoiceState.Idle;
+      throw err;
+    }
+  }
+
+  /**
+   * Stop the voice session and release resources.
+   *
+   * The audio input is stopped and the state reset to `Idle` even when
+   * cancelling playback fails; the failure still propagates to the caller.
+   */
+  async stop(): Promise<void> {
+    debugLogger.log('[voice] Stopping voice mode.');
+    try {
+      await this.tts.cancel();
+    } finally {
+      try {
+        await this.audioInput.stop();
+      } finally {
+        this.state = VoiceState.Idle;
+      }
+    }
+  }
+
+  /**
+   * Run one chunk through transcribe → speak.
+   *
+   * Chunks that arrive while a previous chunk is still being handled (or
+   * after the session stopped) are dropped. Pipeline errors are logged and
+   * leave an active session in `Error`; they are never rethrown.
+   */
+  private async handleChunk(chunk: AudioChunk): Promise<void> {
+    if (this.state !== VoiceState.Listening) return;
+
+    try {
+      this.state = VoiceState.Processing;
+      const transcript = await this.stt.transcribe(chunk);
+
+      // stop() may have run while transcription was pending. Read through
+      // getState() because the compiler still treats this.state as Processing.
+      if (this.getState() !== VoiceState.Processing) return;
+
+      if (transcript.trim().length === 0) {
+        this.state = VoiceState.Listening;
+        return;
+      }
+
+      debugLogger.log(`[voice] Transcript: "${transcript}"`);
+
+      // In the future this is where the transcript feeds into the Gemini
+      // conversation loop (GeminiClient.sendMessageStream). For now, we
+      // echo it back through TTS as a proof-of-lifecycle.
+      this.state = VoiceState.Speaking;
+      await this.tts.speak(transcript);
+    } catch (err) {
+      debugLogger.error('[voice] Error in voice pipeline:', err);
+      // Leave a stopped session in Idle so start() can be called again.
+      if (this.getState() !== VoiceState.Idle) {
+        this.state = VoiceState.Error;
+      }
+    } finally {
+      if (
+        this.state === VoiceState.Speaking ||
+        this.state === VoiceState.Processing
+      ) {
+        this.state = VoiceState.Listening;
+      }
+    }
+  }
+}