Skip to content
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
670a02f
deflake
Adib234 Feb 26, 2026
b734b75
update
Adib234 Feb 26, 2026
d01bd83
update
Adib234 Feb 26, 2026
944e1d1
update
Adib234 Feb 26, 2026
ad4bad2
address comment
Adib234 Feb 26, 2026
eae8145
update test
Adib234 Feb 26, 2026
d8f6232
test change
Adib234 Feb 26, 2026
b5258bb
debug
Adib234 Feb 27, 2026
03b8d24
update
Adib234 Feb 27, 2026
730cd41
debug
Adib234 Feb 27, 2026
7f67017
fix build error
Adib234 Feb 27, 2026
3ff6c40
Merge branch 'main' into adibakm/deflake-plan-mode
Adib234 Feb 27, 2026
c03d733
revert debugging
Adib234 Feb 27, 2026
6e248a3
build and format
Adib234 Feb 27, 2026
bb1bb26
add policy in integration test
Adib234 Mar 1, 2026
133fd7b
update policy in integration test
Adib234 Mar 1, 2026
9103d75
test change
Adib234 Mar 1, 2026
20a4af0
test
Adib234 Mar 1, 2026
6731df1
change test to interactive
Adib234 Mar 1, 2026
367b382
remove unused code
Adib234 Mar 1, 2026
c2993a1
disable interactive prompts at startup
Adib234 Mar 1, 2026
8b27136
Merge branch 'main' into adibakm/deflake-plan-mode
Adib234 Mar 1, 2026
e9f0515
revert deflake.yml
Adib234 Mar 1, 2026
487389b
remove unnecessary comment
Adib234 Mar 1, 2026
1c23919
split test into 2 tests
Adib234 Mar 1, 2026
caf1d5e
split test into 2 tests
Adib234 Mar 1, 2026
db14739
address nit
Adib234 Mar 2, 2026
d870519
revert deflake.yml
Adib234 Mar 2, 2026
487c23a
revert deflake.yml
Adib234 Mar 2, 2026
cab449d
revert deflake.yml
Adib234 Mar 2, 2026
51ad095
address lint
Adib234 Mar 2, 2026
bc2aff5
Merge branch 'main' into adibakm/deflake-plan-mode
Adib234 Mar 2, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 35 additions & 18 deletions .github/workflows/deflake.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@ on:
workflow_dispatch:
inputs:
branch_ref:
description: 'Branch to run on'
required: true
default: 'main'
description: 'Branch to run on (defaults to selected branch)'
required: false
type: 'string'
test_name_pattern:
description: 'The test name pattern to use'
test_filter:
description: 'Test name pattern to run (passed to vitest -t)'
required: false
default: 'should (allow|deny) write_file to (the plans|non-plans) directory in plan mode'
type: 'string'
runs:
description: 'The number of runs'
Expand All @@ -27,7 +27,6 @@ jobs:
deflake_e2e_linux:
name: 'E2E Test (Linux) - ${{ matrix.sandbox }}'
runs-on: 'gemini-cli-ubuntu-16-core'
if: "github.repository == 'google-gemini/gemini-cli'"
strategy:
fail-fast: false
matrix:
Expand All @@ -41,7 +40,7 @@ jobs:
- name: 'Checkout'
uses: 'actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955' # ratchet:actions/checkout@v5
with:
ref: '${{ github.event.pull_request.head.sha }}'
ref: '${{ github.event.inputs.branch_ref || github.ref }}'
repository: '${{ github.repository }}'

- name: 'Set up Node.js ${{ matrix.node-version }}'
Expand All @@ -62,28 +61,37 @@ jobs:
- name: 'Run E2E tests'
env:
GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
DEBUG_SCHEDULER: 'true'
IS_DOCKER: "${{ matrix.sandbox == 'sandbox:docker' }}"
KEEP_OUTPUT: 'true'
RUNS: '${{ github.event.inputs.runs }}'
TEST_NAME_PATTERN: '${{ github.event.inputs.test_name_pattern }}'
TEST_FILTER: '${{ github.event.inputs.test_filter }}'
VERBOSE: 'true'
shell: 'bash'
run: |
# Construct the inner command
if [[ "${IS_DOCKER}" == "true" ]]; then
npm run deflake:test:integration:sandbox:docker -- --runs="${RUNS}" -- --testNamePattern "'${TEST_NAME_PATTERN}'"
INNER_CMD="npm run test:integration:sandbox:docker -- --retry=0"
else
npm run deflake:test:integration:sandbox:none -- --runs="${RUNS}" -- --testNamePattern "'${TEST_NAME_PATTERN}'"
INNER_CMD="npm run test:integration:sandbox:none -- --retry=0"
fi

# Append test filter if provided
if [[ -n "${TEST_FILTER}" ]]; then
INNER_CMD="${INNER_CMD} -t \"${TEST_FILTER}\""
fi

# Run deflake script directly
node scripts/deflake.js --command="${INNER_CMD}" --runs="${RUNS}"

deflake_e2e_mac:
name: 'E2E Test (macOS)'
runs-on: 'macos-latest'
if: "github.repository == 'google-gemini/gemini-cli'"
steps:
- name: 'Checkout'
uses: 'actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955' # ratchet:actions/checkout@v5
with:
ref: '${{ github.event.pull_request.head.sha }}'
ref: '${{ github.event.inputs.branch_ref || github.ref }}'
repository: '${{ github.repository }}'

- name: 'Set up Node.js 20.x'
Expand All @@ -105,24 +113,28 @@ jobs:
if: "runner.os != 'Windows'"
env:
GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
DEBUG_SCHEDULER: 'true'
KEEP_OUTPUT: 'true'
RUNS: '${{ github.event.inputs.runs }}'
SANDBOX: 'sandbox:none'
TEST_NAME_PATTERN: '${{ github.event.inputs.test_name_pattern }}'
TEST_FILTER: '${{ github.event.inputs.test_filter }}'
VERBOSE: 'true'
run: |
npm run deflake:test:integration:sandbox:none -- --runs="${RUNS}" -- --testNamePattern "'${TEST_NAME_PATTERN}'"
INNER_CMD="npm run test:integration:sandbox:none -- --retry=0"
if [[ -n "${TEST_FILTER}" ]]; then
INNER_CMD="${INNER_CMD} -t \"${TEST_FILTER}\""
fi
node scripts/deflake.js --command="${INNER_CMD}" --runs="${RUNS}"

deflake_e2e_windows:
name: 'Slow E2E - Win'
runs-on: 'gemini-cli-windows-16-core'
if: "github.repository == 'google-gemini/gemini-cli'"

steps:
- name: 'Checkout'
uses: 'actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955' # ratchet:actions/checkout@v5
with:
ref: '${{ github.event.pull_request.head.sha }}'
ref: '${{ github.event.inputs.branch_ref || github.ref }}'
repository: '${{ github.repository }}'

- name: 'Set up Node.js 20.x'
Expand Down Expand Up @@ -160,14 +172,19 @@ jobs:
- name: 'Run E2E tests'
env:
GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
DEBUG_SCHEDULER: 'true'
KEEP_OUTPUT: 'true'
SANDBOX: 'sandbox:none'
VERBOSE: 'true'
NODE_OPTIONS: '--max-old-space-size=32768 --max-semi-space-size=256'
UV_THREADPOOL_SIZE: '32'
NODE_ENV: 'test'
RUNS: '${{ github.event.inputs.runs }}'
TEST_NAME_PATTERN: '${{ github.event.inputs.test_name_pattern }}'
TEST_FILTER: '${{ github.event.inputs.test_filter }}'
shell: 'pwsh'
run: |
npm run deflake:test:integration:sandbox:none -- --runs="$env:RUNS" -- --testNamePattern "'$env:TEST_NAME_PATTERN'"
$InnerCmd = "npm run test:integration:sandbox:none -- --retry=0"
if ($env:TEST_FILTER) {
$InnerCmd = "$InnerCmd -t `"$env:TEST_FILTER`""
}
node scripts/deflake.js --command "$InnerCmd" --runs $env:RUNS
110 changes: 83 additions & 27 deletions integration-tests/plan-mode.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@
* SPDX-License-Identifier: Apache-2.0
*/

import { writeFileSync } from 'node:fs';
import { join } from 'node:path';
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import { TestRig, checkModelOutputContent } from './test-helper.js';
import { TestRig, checkModelOutputContent, GEMINI_DIR } from './test-helper.js';

describe('Plan Mode', () => {
let rig: TestRig;
Expand Down Expand Up @@ -62,50 +64,98 @@ describe('Plan Mode', () => {
});
});

it.skip('should allow write_file only in the plans directory in plan mode', async () => {
await rig.setup(
'should allow write_file only in the plans directory in plan mode',
{
settings: {
experimental: { plan: true },
tools: {
core: ['write_file', 'read_file', 'list_directory'],
allowed: ['write_file'],
it('should allow write_file to the plans directory in plan mode', async () => {
const plansDir = '.gemini/tmp/v1/session/plans';
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This path can be confusing for someone trying to update plans. Can we use a realistic path like:

Suggested change
const plansDir = '.gemini/tmp/v1/session/plans';
const plansDir = '.gemini/tmp/foo/123/plans';

The same applies to the other examples below.

const testName =
'should allow write_file to the plans directory in plan mode';

await rig.setup(testName, {
settings: {
experimental: { plan: true },
tools: {
core: ['write_file', 'read_file', 'list_directory'],
},
general: {
defaultApprovalMode: 'plan',
plan: {
directory: plansDir,
},
general: { defaultApprovalMode: 'plan' },
},
},
});

// Disable the interactive terminal setup prompt in tests
writeFileSync(
join(rig.homeDir!, GEMINI_DIR, 'state.json'),
JSON.stringify({ terminalSetupPromptShown: true }, null, 2),
);

// We ask the agent to create a plan for a feature, which should trigger a write_file in the plans directory.
// Verify that write_file outside of plan directory fails
await rig.run({
const run = await rig.runInteractive({
approvalMode: 'plan',
stdin:
'Create a file called plan.md in the plans directory. Then create a file called hello.txt in the current directory',
});

const toolLogs = rig.readToolLogs();
const writeLogs = toolLogs.filter(
(l) => l.toolRequest.name === 'write_file',
await run.type('Create a file called plan.md in the plans directory.');
await run.type('\r');

await rig.expectToolCallSuccess(['write_file'], 30000, (args) =>
args.includes('plan.md'),
);

const planWrite = writeLogs.find(
const toolLogs = rig.readToolLogs();
const planWrite = toolLogs.find(
(l) =>
l.toolRequest.name === 'write_file' &&
l.toolRequest.args.includes('plans') &&
l.toolRequest.args.includes('plan.md'),
);
expect(planWrite?.toolRequest.success).toBe(true);
});

const blockedWrite = writeLogs.find((l) =>
l.toolRequest.args.includes('hello.txt'),
it('should deny write_file to non-plans directory in plan mode', async () => {
const plansDir = '.gemini/tmp/v1/session/plans';
const testName =
'should deny write_file to non-plans directory in plan mode';

await rig.setup(testName, {
settings: {
experimental: { plan: true },
tools: {
core: ['write_file', 'read_file', 'list_directory'],
},
general: {
defaultApprovalMode: 'plan',
plan: {
directory: plansDir,
},
},
},
});

// Disable the interactive terminal setup prompt in tests
writeFileSync(
join(rig.homeDir!, GEMINI_DIR, 'state.json'),
JSON.stringify({ terminalSetupPromptShown: true }, null, 2),
);

// The model is nondeterministic: sometimes a blocked write appears in the tool logs and sometimes it doesn't.
if (blockedWrite) {
expect(blockedWrite?.toolRequest.success).toBe(false);
}
const run = await rig.runInteractive({
approvalMode: 'plan',
});

expect(planWrite?.toolRequest.success).toBe(true);
await run.type('Create a file called hello.txt in the current directory.');
await run.type('\r');

const toolLogs = rig.readToolLogs();
const writeLog = toolLogs.find(
(l) =>
l.toolRequest.name === 'write_file' &&
l.toolRequest.args.includes('hello.txt'),
);

// In Plan Mode, writes outside the plans directory should be blocked.
// The model is nondeterministic: sometimes it doesn't even attempt the write, but if it does, the write must fail.
if (writeLog) {
expect(writeLog.toolRequest.success).toBe(false);
}
});

it('should be able to enter plan mode from default mode', async () => {
Expand All @@ -119,6 +169,12 @@ describe('Plan Mode', () => {
},
});

// Disable the interactive terminal setup prompt in tests
writeFileSync(
join(rig.homeDir!, GEMINI_DIR, 'state.json'),
JSON.stringify({ terminalSetupPromptShown: true }, null, 2),
);

// Start in default mode and ask to enter plan mode.
await rig.run({
approvalMode: 'default',
Expand Down
2 changes: 1 addition & 1 deletion packages/core/src/policy/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ import * as crypto from 'node:crypto';
import { fileURLToPath } from 'node:url';
import { Storage } from '../config/storage.js';
import {
ApprovalMode,
type PolicyEngineConfig,
PolicyDecision,
type PolicyRule,
ApprovalMode,
type PolicySettings,
type SafetyCheckerRule,
} from './types.js';
Expand Down
1 change: 1 addition & 0 deletions packages/test-utils/src/test-rig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import { fileURLToPath } from 'node:url';
import { env } from 'node:process';
import { setTimeout as sleep } from 'node:timers/promises';
import { DEFAULT_GEMINI_MODEL, GEMINI_DIR } from '@google/gemini-cli-core';
export { GEMINI_DIR };
import * as pty from '@lydell/node-pty';
import stripAnsi from 'strip-ansi';
import * as os from 'node:os';
Expand Down
Loading