Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions .claude/settings.local.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{
"permissions": {
"allow": [
"Bash(done)",
"Bash(xargs grep -l \"mode\")",
"Bash(xargs grep -l \"hook\\\\|middleware\\\\|plugin\")",
"Bash(xargs grep -l \"mode\\\\|Mode\")",
"Bash(xargs grep -l \"SecurityModel\\\\|sandbox\")",
"Bash(xargs -I {} bash -c 'echo \"\"=== {} ===\"\" && head -50 \"\"{}\"\"')",
"Bash(node --version)",
"Bash(npm install)",
"Bash(npm run build)",
"Bash(npm start)",
"Bash(node packages/cli/bundle/gemini.js --version)",
"Bash(node bundle/gemini.js --version)",
"Bash(node bundle/gemini.js --help)",
"Bash(npm run build --workspace=@google/gemini-cli-core)",
"Bash(npm run build --workspace=@google/gemini-cli)",
"Bash(node bundle/gemini.js --voice)",
"Bash(npm run bundle)",
"Bash(npm start -- --voice)",
"Bash(node bundle/gemini.js)",
"Bash(echo \"EXIT CODE: $?\")",
"Bash(npm run test --workspace=@google/gemini-cli-core)",
"Bash(npm run test --workspace=@google/gemini-cli)",
"Bash(npx vitest run packages/cli/src/gemini.test.tsx)",
"Bash(git checkout -- 'packages/cli/src/ui/components/__snapshots__/ConfigInitDisplay.test.tsx.snap')",
"Bash(git checkout -- package-lock.json)"
]
}
}
8 changes: 7 additions & 1 deletion packages/cli/src/config/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ export interface CliArgs {
allowedTools: string[] | undefined;
acp?: boolean;
experimentalAcp?: boolean;
experimentalVoice: boolean | undefined;
extensions: string[] | undefined;
listExtensions: boolean | undefined;
resume: string | typeof RESUME_LATEST | undefined;
Expand Down Expand Up @@ -182,6 +183,11 @@ export async function parseArguments(
description:
'Starts the agent in ACP mode (deprecated, use --acp instead)',
})
.option('experimental-voice', {
type: 'boolean',
description: '[Experimental] Start in hands-free voice mode',
hidden: true,
})
.option('allowed-mcp-server-names', {
type: 'array',
string: true,
Expand Down Expand Up @@ -758,7 +764,7 @@ export async function loadCliConfig(
bugCommand: settings.advanced?.bugCommand,
model: resolvedModel,
maxSessionTurns: settings.model?.maxSessionTurns,

experimentalVoice: argv.experimentalVoice || false,
listExtensions: argv.listExtensions || false,
listSessions: argv.listSessions || false,
deleteSession: argv.deleteSession,
Expand Down
1 change: 1 addition & 0 deletions packages/cli/src/gemini.test.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,7 @@ describe('gemini.tsx main function kitty protocol', () => {
allowedMcpServerNames: undefined,
allowedTools: undefined,
experimentalAcp: undefined,
experimentalVoice: undefined,
extensions: undefined,
listExtensions: undefined,
includeDirectories: undefined,
Expand Down
9 changes: 9 additions & 0 deletions packages/cli/src/gemini.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -676,6 +676,15 @@ export async function main() {
return runAcpClient(config, settings, argv);
}

// Voice mode currently exists only as an architectural skeleton. When the
// flag is set, tell the user on stderr, run the normal exit cleanup, and
// leave before any interactive or non-interactive session is started.
if (config.getExperimentalVoice()) {
  writeToStderr(
    '[experimental] Voice mode is not yet implemented. ' +
      // Name the flag exactly as it is registered with the argument parser
      // (`experimental-voice`); no `--voice` option is defined.
      'The --experimental-voice flag registers the architectural skeleton only.\n',
  );
  await runExitCleanup();
  process.exit(ExitCodes.SUCCESS);
}

let input = config.getQuestion();
const useAlternateBuffer = shouldEnterAlternateScreen(
isAlternateBufferEnabled(config),
Expand Down
1 change: 1 addition & 0 deletions packages/cli/src/gemini_cleanup.test.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ describe('gemini.tsx main function cleanup', () => {
getMcpClientManager: vi.fn(),
getIdeMode: vi.fn(() => false),
getAcpMode: vi.fn(() => true),
getExperimentalVoice: vi.fn(() => false),
getScreenReader: vi.fn(() => false),
getGeminiMdFileCount: vi.fn(() => 0),
getProjectRoot: vi.fn(() => '/'),
Expand Down
1 change: 1 addition & 0 deletions packages/cli/src/test-utils/mockConfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ export const createMockConfig = (overrides: Partial<Config> = {}): Config =>
getSessionId: vi.fn().mockReturnValue('mock-session-id'),
getContentGeneratorConfig: vi.fn(() => ({ authType: 'google' })),
getAcpMode: vi.fn(() => false),
getExperimentalVoice: vi.fn(() => false),
isBrowserLaunchSuppressed: vi.fn(() => false),
setRemoteAdminSettings: vi.fn(),
isYoloModeDisabled: vi.fn(() => false),
Expand Down
9 changes: 5 additions & 4 deletions packages/core/src/config/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,7 @@ export interface ConfigParameters {
disableLoopDetection?: boolean;
maxSessionTurns?: number;
acpMode?: boolean;
experimentalVoice?: boolean;
listSessions?: boolean;
deleteSession?: string;
listExtensions?: boolean;
Expand Down Expand Up @@ -715,6 +716,7 @@ export class Config implements McpContext {
| Record<string, SummarizeToolOutputSettings>
| undefined;
private readonly acpMode: boolean = false;
private readonly experimentalVoice: boolean = false;
private readonly loadMemoryFromIncludeDirectories: boolean = false;
private readonly includeDirectoryTree: boolean = true;
private readonly importFormat: 'tree' | 'flat';
Expand Down Expand Up @@ -912,6 +914,7 @@ export class Config implements McpContext {
};
this.maxSessionTurns = params.maxSessionTurns ?? -1;
this.acpMode = params.acpMode ?? false;
this.experimentalVoice = params.experimentalVoice ?? false;
this.listSessions = params.listSessions ?? false;
this.deleteSession = params.deleteSession;
this.listExtensions = params.listExtensions ?? false;
Expand Down Expand Up @@ -2255,10 +2258,8 @@ export class Config implements McpContext {
return this.acpMode;
}

/**
 * Waits for MCP initialization to settle.
 *
 * Resolves immediately when no initialization promise has been recorded;
 * otherwise resolves (or rejects) together with that promise.
 */
async waitForMcpInit(): Promise<void> {
  if (this.mcpInitializationPromise) {
    await this.mcpInitializationPromise;
  }
}

/**
 * Whether experimental voice mode was requested for this configuration.
 *
 * @returns The `experimentalVoice` value captured at construction time.
 */
getExperimentalVoice(): boolean {
  return this.experimentalVoice;
}

getListExtensions(): boolean {
Expand Down
67 changes: 67 additions & 0 deletions packages/core/src/voice/types.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/

/**
* Contracts for the Hands-Free Voice Mode pipeline.
*
* Architecture:
* Mic → AudioInputProvider → SpeechToTextAdapter → [Gemini API] → TextToSpeechAdapter → Speaker
*
* Each interface is designed to be swappable — the initial implementation
* will use no-op stubs, and real backends (native audio, WebSocket bridges,
* MCP audio servers, Gemini Live API) can be plugged in later.
*/

/**
 * One block of PCM audio, as emitted by an AudioInputProvider.
 *
 * Every field is read-only: a chunk carries its raw samples together with
 * the format description needed to interpret them.
 */
export interface AudioChunk {
  /** The raw PCM sample data. */
  readonly samples: Buffer;

  /** Sampling frequency in Hz, for example 16000. */
  readonly sampleRate: number;

  /** How many audio channels the data holds (1 = mono, 2 = stereo). */
  readonly channels: number;
}

/**
 * A source of captured audio, such as a microphone or another input device.
 */
export interface AudioInputProvider {
  /**
   * Starts capturing audio.
   *
   * @param onChunk Callback through which implementations should emit each
   *   captured chunk.
   */
  start(onChunk: (chunk: AudioChunk) => void): Promise<void>;

  /** Ends the capture and releases the resources held for it. */
  stop(): Promise<void>;

  /** Reports whether the provider is capturing right now. */
  isActive(): boolean;
}

/**
 * Speech-to-text backend: turns captured audio into text.
 */
export interface SpeechToTextAdapter {
  /**
   * Transcribes one audio chunk.
   *
   * @param chunk The audio to transcribe.
   * @returns A promise for the transcribed text.
   */
  transcribe(chunk: AudioChunk): Promise<string>;
}

/**
 * Text-to-speech backend: turns text into audible speech.
 */
export interface TextToSpeechAdapter {
  /**
   * Synthesizes the given text and plays it back.
   *
   * @param text The text to speak.
   * @returns A promise that resolves once playback has ended.
   */
  speak(text: string): Promise<void>;

  /** Interrupts whatever playback is currently in progress. */
  cancel(): Promise<void>;
}

/**
 * Optional settings for a voice session. Every field may be omitted.
 */
export interface VoiceSessionConfig {
  /** Audio capture sample rate in Hz (default: 16000). */
  sampleRate?: number;

  /** Locale/language code used for STT/TTS, e.g. "en-US". */
  locale?: string;
}

/**
 * The lifecycle states a voice mode controller can be in.
 *
 * Each member is backed by its lowercase string name, so values stay
 * readable when they appear in logs.
 */
export enum VoiceState {
  /** No session is running. */
  Idle = 'idle',

  /** Waiting for audio to act on. */
  Listening = 'listening',

  /** Audio has been received and is being handled. */
  Processing = 'processing',

  /** Output is being spoken. */
  Speaking = 'speaking',

  /** The pipeline hit a failure. */
  Error = 'error',
}
108 changes: 108 additions & 0 deletions packages/core/src/voice/voiceModeController.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/**
* @license
* Copyright 2026 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/

import { debugLogger } from '../utils/debugLogger.js';
import type {
AudioInputProvider,
SpeechToTextAdapter,
TextToSpeechAdapter,
VoiceSessionConfig,
} from './types.js';
import { VoiceState } from './types.js';

/**
 * Orchestrates the voice mode lifecycle.
 *
 * Wires together an AudioInputProvider, SpeechToTextAdapter, and
 * TextToSpeechAdapter into a coherent listen→transcribe→respond→speak loop.
 *
 * This is a skeleton — real audio backends will be injected later.
 * The controller is intentionally thin so it can be tested without hardware.
 *
 * State handling:
 * - `start()` moves Idle → Listening; in any other state it is ignored.
 * - Each chunk received while Listening moves through Processing and
 *   (for a non-empty transcript) Speaking, then back to Listening.
 *   Chunks that arrive in any other state are dropped.
 * - A failure inside the pipeline leaves the controller in Error; chunks are
 *   then ignored until `stop()` returns it to Idle.
 * - `stop()` always ends in Idle and invalidates work that is still in
 *   flight, so a late transcript is never spoken after the session ended.
 */
export class VoiceModeController {
  private state: VoiceState = VoiceState.Idle;
  /**
   * Identity of the session created by the latest successful `start()` call,
   * or `null` when no session is running. Chunk handlers compare against it
   * after every await to detect that `stop()` ran in the meantime.
   */
  private session: symbol | null = null;
  private readonly audioInput: AudioInputProvider;
  private readonly stt: SpeechToTextAdapter;
  private readonly tts: TextToSpeechAdapter;
  private readonly config: VoiceSessionConfig;

  /**
   * @param audioInput Provider that captures audio and emits chunks.
   * @param stt Adapter used to transcribe each chunk.
   * @param tts Adapter used to speak the response.
   * @param config Optional session settings; only used for logging here.
   */
  constructor(
    audioInput: AudioInputProvider,
    stt: SpeechToTextAdapter,
    tts: TextToSpeechAdapter,
    config: VoiceSessionConfig = {},
  ) {
    this.audioInput = audioInput;
    this.stt = stt;
    this.tts = tts;
    this.config = config;
  }

  /** Current lifecycle state. */
  getState(): VoiceState {
    return this.state;
  }

  /**
   * Start the voice session.
   * Opens the audio input and begins the listen loop.
   *
   * Calling this in any state other than Idle logs a warning and does
   * nothing.
   *
   * @throws Whatever `audioInput.start()` rejects with. In that case the
   *   controller is returned to Idle so that `start()` can be retried.
   */
  async start(): Promise<void> {
    if (this.state !== VoiceState.Idle) {
      debugLogger.warn(
        `VoiceModeController.start() called in state "${this.state}", ignoring.`,
      );
      return;
    }

    debugLogger.log(
      `[voice] Starting voice mode (locale=${this.config.locale ?? 'default'}, ` +
        `sampleRate=${String(this.config.sampleRate ?? 16000)})`,
    );

    const session = Symbol('voice-session');
    const isCurrent = (): boolean => this.session === session;

    this.session = session;
    this.state = VoiceState.Listening;

    try {
      await this.audioInput.start(async (chunk) => {
        // Drop chunks from an ended session and chunks that arrive while a
        // previous one is still being processed or spoken.
        if (!isCurrent() || this.state !== VoiceState.Listening) return;

        this.state = VoiceState.Processing;
        try {
          const transcript = await this.stt.transcribe(chunk);

          // stop() may have run while transcribing; an empty transcript has
          // nothing to speak. Either way the finally block settles the state.
          if (!isCurrent() || transcript.trim().length === 0) return;

          debugLogger.log(`[voice] Transcript: "${transcript}"`);

          // In the future this is where the transcript feeds into the Gemini
          // conversation loop (GeminiClient.sendMessageStream). For now, we
          // echo it back through TTS as a proof-of-lifecycle.
          this.state = VoiceState.Speaking;
          await this.tts.speak(transcript);
        } catch (err) {
          debugLogger.error('[voice] Error in voice pipeline:', err);
          // Only a live session may enter Error; a failure that surfaces
          // after stop() must not overwrite Idle.
          if (isCurrent()) {
            this.state = VoiceState.Error;
          }
        } finally {
          // Resume listening after a completed turn, but never resurrect a
          // session that was stopped and never leave the Error state here.
          if (
            isCurrent() &&
            (this.state === VoiceState.Speaking ||
              this.state === VoiceState.Processing)
          ) {
            this.state = VoiceState.Listening;
          }
        }
      });
    } catch (err) {
      // Capture could not be started. Roll back so the controller is not
      // stuck in Listening with every later start() call being ignored.
      if (isCurrent()) {
        this.session = null;
        this.state = VoiceState.Idle;
      }
      throw err;
    }
  }

  /**
   * Stop the voice session and release resources.
   *
   * The audio input is stopped and the state reset to Idle even when
   * cancelling playback fails.
   *
   * @throws Whatever `tts.cancel()` or `audioInput.stop()` rejects with.
   */
  async stop(): Promise<void> {
    debugLogger.log('[voice] Stopping voice mode.');

    // Invalidate in-flight chunk handlers before awaiting anything.
    this.session = null;

    try {
      await this.tts.cancel();
    } finally {
      try {
        await this.audioInput.stop();
      } finally {
        this.state = VoiceState.Idle;
      }
    }
  }
}