-
Notifications
You must be signed in to change notification settings - Fork 13k
feat: Add voice input with pluggable backend (Gemini zero-install + Whisper) #18499
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 46 commits
135dbd9
82a93d6
deb7f73
97b657c
3c1f6bf
5a44007
6079a04
ea161cf
933af01
0c21255
1736fcd
3bd199e
86b703e
bcd63c3
21fdfc6
7c98772
ff0b985
3d26763
cee638f
24c72b0
911eaf6
bc05702
bd1665f
020b1da
1082384
4ac4fd8
f58ed15
694f6d8
e92cc6e
5392617
f59415e
365fdd1
a48c73d
4546044
b296dd5
e71bc52
656531b
2eb05cd
c71e42a
faa6581
d6b147b
10e32ce
34c7a0f
eb1757e
19e4b56
9353a19
96bee19
c0763b5
91b174a
e316d6e
da0d3b5
1677dae
27e7c82
3078d2a
f00d05c
1fbc052
615ce40
fc5aff7
2f89ac2
414674f
d2ed487
c87dc06
52b8352
77b5dbe
94739e1
015d28e
a2f98f5
25b4781
0fab703
644d4dc
b3305c5
e9a3156
70a4a4c
1b28c60
c8ffca7
a05117f
a72d219
d77d640
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -816,6 +816,63 @@ const SETTINGS_SCHEMA = { | |
| }, | ||
| }, | ||
|
|
||
| voice: { | ||
| type: 'object', | ||
| label: 'Voice Input', | ||
| category: 'General', | ||
| requiresRestart: false, | ||
| default: {}, | ||
| description: | ||
| 'Settings for voice input. Note: Voice input is not natively supported in WSL2 (Windows Subsystem for Linux).', | ||
| showInDialog: false, | ||
| properties: { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Need to add `showInDialog: false,`.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done — added `showInDialog: false`.
||
| enabled: { | ||
| type: 'boolean', | ||
| label: 'Enable Voice Input', | ||
| category: 'General', | ||
| requiresRestart: false, | ||
| default: false, | ||
| description: 'Enable voice input support.', | ||
| showInDialog: true, | ||
| }, | ||
| provider: { | ||
| type: 'enum', | ||
| label: 'Transcription Backend', | ||
| category: 'General', | ||
| requiresRestart: false, | ||
| default: 'gemini', | ||
| description: | ||
| 'Transcription backend: "gemini" (default, zero-install) or "whisper" (local).', | ||
| showInDialog: true, | ||
| options: [ | ||
| { value: 'gemini', label: 'Gemini (Cloud)' }, | ||
| { value: 'whisper', label: 'Whisper (Local)' }, | ||
| ], | ||
| }, | ||
| whisperPath: { | ||
| type: 'string', | ||
| label: 'Whisper Binary Path', | ||
| category: 'General', | ||
| requiresRestart: false, | ||
| default: undefined as string | undefined, | ||
| description: | ||
| 'Path to the whisper executable. Only used when provider is "whisper".', | ||
| showInDialog: true, | ||
| }, | ||
| silenceThreshold: { | ||
| type: 'number', | ||
| label: 'Silence Detection Threshold', | ||
| category: 'General', | ||
| requiresRestart: false, | ||
| default: 80, | ||
| description: | ||
| 'RMS energy threshold (0–1000) below which audio is discarded as silence. ' + | ||
| 'Lower values allow quieter speech such as whispering. 0 disables silence detection.', | ||
| showInDialog: true, | ||
| }, | ||
| }, | ||
| }, | ||
|
|
||
| ide: { | ||
| type: 'object', | ||
| label: 'IDE', | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,84 @@ | ||
| /** | ||
| * @license | ||
| * Copyright 2026 Google LLC | ||
| * SPDX-License-Identifier: Apache-2.0 | ||
| */ | ||
|
|
||
| import { afterEach, describe, expect, it } from 'vitest'; | ||
| import { TestRig } from '@google/gemini-cli-test-utils'; | ||
| import path from 'node:path'; | ||
| import { fileURLToPath } from 'node:url'; | ||
| import stripAnsi from 'strip-ansi'; | ||
|
|
||
| const __dirname = path.dirname(fileURLToPath(import.meta.url)); | ||
|
|
||
| describe.skipIf(process.platform === 'win32')('Voice Whisper PTY repro', () => { | ||
| let rig: TestRig | undefined; | ||
|
|
||
| afterEach(async () => { | ||
| await rig?.cleanup(); | ||
| }); | ||
|
|
||
| it('repro: whisper can remain stuck in "Speak now..." after Esc in a real PTY session', async () => { | ||
| const fakeResponsesPath = path.join( | ||
| __dirname, | ||
| '../test-utils/fixtures/simple.responses', | ||
| ); | ||
|
|
||
| rig = new TestRig(); | ||
| rig.setup('voice-whisper-pty-repro', { | ||
| fakeResponsesPath, | ||
| }); | ||
|
|
||
| const ignoreSigintScript = rig.createScript( | ||
| 'ignore-sigint.js', | ||
| [ | ||
| "process.on('SIGINT', () => {});", | ||
| 'setInterval(() => {}, 1000);', | ||
| '', | ||
| ].join('\n'), | ||
| ); | ||
|
|
||
| rig.createScript( | ||
| 'sox', | ||
| [ | ||
| '#!/usr/bin/env bash', | ||
| `exec "${process.execPath}" "${ignoreSigintScript}" "$@"`, | ||
| '', | ||
| ].join('\n'), | ||
| ); | ||
|
|
||
| // Make the fake recorder executable. | ||
| const fs = await import('node:fs/promises'); | ||
| await fs.chmod(path.join(rig.testDir!, 'sox'), 0o755); | ||
|
|
||
| const run = await rig.runInteractive({ | ||
| env: { | ||
| PATH: `${rig.testDir}:${process.env['PATH'] ?? ''}`, | ||
| GEMINI_API_KEY: 'test-key', | ||
| }, | ||
| }); | ||
|
|
||
| const submitCommand = async (command: string) => { | ||
| await run.sendKeys(command); | ||
| await new Promise((resolve) => setTimeout(resolve, 75)); | ||
| await run.sendKeys('\r'); | ||
| }; | ||
|
|
||
| await submitCommand('/voice provider whisper'); | ||
| await run.expectText('Voice transcription backend set to: whisper', 10000); | ||
|
|
||
| await submitCommand('/voice enable'); | ||
| await run.expectText('Voice input enabled', 10000); | ||
|
|
||
| await run.sendText(' '); | ||
| await new Promise((resolve) => setTimeout(resolve, 120)); | ||
| await run.sendText(' '); | ||
| await run.expectText('Speak now...', 10000); | ||
|
|
||
| await run.sendText('\u001B'); | ||
| await new Promise((resolve) => setTimeout(resolve, 500)); | ||
|
|
||
| expect(stripAnsi(run.output)).toContain('Speak now...'); | ||
| }, 60000); | ||
| }); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's add an additional setting that indicates whether voice support is enabled.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done —
`voice.enabled` is implemented in `settingsSchema.ts` and read in `AppContainer.tsx`. When `false`, the backend is not initialized and keyboard shortcuts are suppressed entirely (the key handler now checks `isEnabled` before consuming the keypress).