Skip to content
Closed
48 changes: 38 additions & 10 deletions evals/app-test-helper.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,44 @@ import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core';
* forbidden. Evals must test against the full, default tool set to ensure
* realistic behavior.
*/
interface EvalConfigOverrides {
/** Restricting tools via excludeTools in evals is forbidden. */
excludeTools?: never;
/** Restricting tools via coreTools in evals is forbidden. */
coreTools?: never;
/** Restricting tools via allowedTools in evals is forbidden. */
allowedTools?: never;
/** Restricting tools via mainAgentTools in evals is forbidden. */
mainAgentTools?: never;
/** Names of the config keys that restrict which tools are available. */
type ForbiddenToolKeys =
  | 'excludeTools'
  | 'coreTools'
  | 'allowedTools'
  | 'mainAgentTools';

/**
 * Extends `T` so that every key in {@link ForbiddenToolKeys} is optional and
 * typed as `never`, which makes supplying a value for one of them a type error.
 */
type NoToolRestrictions<T> = T & Partial<Record<ForbiddenToolKeys, never>>;

/**
 * Free-form config overrides accepted by evals: any string key is allowed
 * except the tool-restriction keys, which are typed as `never`.
 */
export type EvalConfigOverrides = NoToolRestrictions<Record<string, unknown>>;

/**
 * Runtime counterpart to the compile-time tool-restriction guard.
 *
 * Produces a shallow copy of `overrides` with the `excludeTools`, `coreTools`,
 * `allowedTools` and `mainAgentTools` keys removed. The object passed in is
 * never mutated.
 *
 * @param overrides - Config overrides to clean; may be omitted.
 * @returns The cleaned copy, or the original falsy argument unchanged.
 */
function sanitizeConfigOverrides(
  overrides?: Record<string, unknown>,
): Record<string, unknown> | undefined {
  if (overrides) {
    const blockedKeys = [
      'excludeTools',
      'coreTools',
      'allowedTools',
      'mainAgentTools',
    ] as const;

    // Work on a copy so the caller's object keeps all of its keys.
    const cleaned: Record<string, unknown> = { ...overrides };
    for (const blocked of blockedKeys) {
      // Deleting a key that is not present is a no-op, so no `in` check.
      delete cleaned[blocked];
    }
    return cleaned;
  }

  return overrides;
}

export interface AppEvalCase {
Expand All @@ -51,7 +79,7 @@ export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
const rig = new AppRig({
configOverrides: {
model: DEFAULT_GEMINI_MODEL,
...evalCase.configOverrides,
...sanitizeConfigOverrides(evalCase.configOverrides),
},
});

Expand Down
3 changes: 2 additions & 1 deletion evals/cli_help_delegation.eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ describe('CliHelpAgent Delegation', () => {
},
},
},
prompt: 'Help me create a subagent in this project',
// Refined prompt for clearer delegation intent
prompt: 'How do I create and configure a subagent using the CLI in this project?',
timeout: 60000,
assert: async (rig, _result) => {
const toolLogs = rig.readToolLogs();
Expand Down
21 changes: 17 additions & 4 deletions evals/generalist_delegation.eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,19 @@
import { describe, expect } from 'vitest';
import { appEvalTest } from './app-test-helper.js';

/**
 * Compile-time guard to prevent tool restriction usage in eval configs.
 *
 * `ForbiddenToolKeys` lists the config keys that restrict tools.
 */
type ForbiddenToolKeys =
  | 'excludeTools'
  | 'coreTools'
  | 'allowedTools'
  | 'mainAgentTools';

/**
 * Extends `T` so that every forbidden key is optional and typed as `never`,
 * which makes supplying a value for one of them a type error.
 */
type NoToolRestrictions<T> = T & Partial<Record<ForbiddenToolKeys, never>>;

describe('generalist_delegation', () => {
// --- Positive Evals (Should Delegate) ---

Expand All @@ -21,7 +34,7 @@ describe('generalist_delegation', () => {
experimental: {
enableAgents: true,
},
},
} as NoToolRestrictions<typeof Object>,
files: {
'file1.ts': 'console.log("no semi")',
'file2.ts': 'console.log("no semi")',
Expand Down Expand Up @@ -64,7 +77,7 @@ describe('generalist_delegation', () => {
experimental: {
enableAgents: true,
},
},
} as NoToolRestrictions<typeof Object>,
files: {
'src/a.ts': 'export const a = 1;',
'src/b.ts': 'export const b = 2;',
Expand Down Expand Up @@ -104,7 +117,7 @@ describe('generalist_delegation', () => {
experimental: {
enableAgents: true,
},
},
} as NoToolRestrictions<typeof Object>,
files: {
'README.md': 'This is a proyect.',
},
Expand Down Expand Up @@ -138,7 +151,7 @@ describe('generalist_delegation', () => {
experimental: {
enableAgents: true,
},
},
} as NoToolRestrictions<typeof Object>,
files: {
'src/VERSION': '1.2.3',
},
Expand Down
30 changes: 22 additions & 8 deletions evals/grep_search_functionality.eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,20 @@ import {
checkModelOutputContent,
} from './test-helper.js';

/**
 * Normalizes tool-call arguments into a key/value object.
 *
 * Accepts either an already-decoded object or a JSON string. A string is
 * parsed first; the resulting value (or the original non-string input) is
 * returned only when it is a non-null object.
 *
 * @param args - Raw tool arguments: an object, a JSON string, or anything else.
 * @returns The arguments as an object, or `null` when `args` is not an object,
 *   is not valid JSON, or is JSON that decodes to a primitive or `null`.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- return type kept for existing callers
function parseToolArgs(args: unknown): Record<string, any> | null {
  let candidate: unknown = args;

  if (typeof args === 'string') {
    try {
      candidate = JSON.parse(args);
    } catch {
      // Malformed JSON: treat as "no usable arguments" rather than throwing.
      return null;
    }
  }

  // JSON.parse can yield a number, string, boolean or null; none of those is
  // a usable argument object, so apply the same check as for direct input.
  if (candidate !== null && typeof candidate === 'object') {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    return candidate as Record<string, any>;
  }

  return null;
}

describe('grep_search_functionality', () => {
const TEST_PREFIX = 'Grep Search Functionality: ';

Expand Down Expand Up @@ -44,8 +58,8 @@ describe('grep_search_functionality', () => {
'grep_search',
undefined,
(args) => {
const params = JSON.parse(args);
return params.case_sensitive === true;
const params = parseToolArgs(args);
return params?.case_sensitive === true;
},
);
expect(
Expand Down Expand Up @@ -74,8 +88,8 @@ describe('grep_search_functionality', () => {
'grep_search',
undefined,
(args) => {
const params = JSON.parse(args);
return params.names_only === true;
const params = parseToolArgs(args);
return params?.names_only === true;
},
);
expect(
Expand Down Expand Up @@ -104,8 +118,8 @@ describe('grep_search_functionality', () => {
'grep_search',
undefined,
(args) => {
const params = JSON.parse(args);
return params.include_pattern === '*.js';
const params = parseToolArgs(args);
return params?.include_pattern === '*.js';
},
);
expect(
Expand Down Expand Up @@ -134,8 +148,8 @@ describe('grep_search_functionality', () => {
'grep_search',
undefined,
(args) => {
const params = JSON.parse(args);
return params.dir_path === 'src';
const params = parseToolArgs(args);
return params?.dir_path === 'src';
},
);
expect(
Expand Down
17 changes: 15 additions & 2 deletions evals/model_steering.eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,25 @@ import fs from 'node:fs';
import { appEvalTest } from './app-test-helper.js';
import { PolicyDecision } from '@google/gemini-cli-core';

/**
 * Compile-time guard to prevent tool restriction usage in eval configs.
 *
 * `ForbiddenToolKeys` lists the config keys that restrict tools.
 */
type ForbiddenToolKeys =
  | 'excludeTools'
  | 'coreTools'
  | 'allowedTools'
  | 'mainAgentTools';

/**
 * Extends `T` so that every forbidden key is optional and typed as `never`,
 * which makes supplying a value for one of them a type error.
 */
type NoToolRestrictions<T> = T & Partial<Record<ForbiddenToolKeys, never>>;

describe('Model Steering Behavioral Evals', () => {
appEvalTest('USUALLY_PASSES', {
name: 'Corrective Hint: Model switches task based on hint during tool turn',
configOverrides: {
modelSteering: true,
},
} as NoToolRestrictions<Record<string, unknown>>,
files: {
'README.md':
'# Gemini CLI\nThis is a tool for developers.\nLicense: Apache-2.0\nLine 4\nLine 5\nLine 6',
Expand Down Expand Up @@ -55,7 +68,7 @@ describe('Model Steering Behavioral Evals', () => {
name: 'Suggestive Hint: Model incorporates user guidance mid-stream',
configOverrides: {
modelSteering: true,
},
} as NoToolRestrictions<Record<string, unknown>>,
files: {},
prompt: 'Create a file called "hw.js" with a JS hello world.',
setup: async (rig) => {
Expand Down
32 changes: 31 additions & 1 deletion evals/save_memory.eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,23 @@ import {
checkModelOutputContent,
} from '../integration-tests/test-helper.js';

/**
 * Compile-time guard to prevent tool restriction usage in eval configs.
 *
 * `ForbiddenToolKeys` lists the config keys that restrict tools.
 */
type ForbiddenToolKeys =
  | 'excludeTools'
  | 'coreTools'
  | 'allowedTools'
  | 'mainAgentTools';

/**
 * Extends `T` so that every forbidden key is optional and typed as `never`,
 * which makes supplying a value for one of them a type error.
 */
type NoToolRestrictions<T> = T & Partial<Record<ForbiddenToolKeys, never>>;

describe('save_memory', () => {
const TEST_PREFIX = 'Save memory test: ';
const rememberingFavoriteColor = "Agent remembers user's favorite color";

evalTest('ALWAYS_PASSES', {
name: rememberingFavoriteColor,

Expand All @@ -33,7 +47,9 @@ describe('save_memory', () => {
});
},
});

const rememberingCommandRestrictions = 'Agent remembers command restrictions';

evalTest('USUALLY_PASSES', {
name: rememberingCommandRestrictions,

Expand All @@ -53,6 +69,7 @@ describe('save_memory', () => {
});

const rememberingWorkflow = 'Agent remembers workflow preferences';

evalTest('USUALLY_PASSES', {
name: rememberingWorkflow,

Expand All @@ -73,6 +90,7 @@ describe('save_memory', () => {

const ignoringTemporaryInformation =
'Agent ignores temporary conversation details';

evalTest('ALWAYS_PASSES', {
name: ignoringTemporaryInformation,

Expand All @@ -82,6 +100,7 @@ describe('save_memory', () => {
const wasToolCalled = rig
.readToolLogs()
.some((log) => log.toolRequest.name === 'save_memory');

expect(
wasToolCalled,
'save_memory should not be called for temporary information',
Expand All @@ -96,6 +115,7 @@ describe('save_memory', () => {
});

const rememberingPetName = "Agent remembers user's pet's name";

evalTest('ALWAYS_PASSES', {
name: rememberingPetName,

Expand All @@ -115,6 +135,7 @@ describe('save_memory', () => {
});

const rememberingCommandAlias = 'Agent remembers custom command aliases';

evalTest('ALWAYS_PASSES', {
name: rememberingCommandAlias,

Expand All @@ -135,6 +156,7 @@ describe('save_memory', () => {

const ignoringDbSchemaLocation =
"Agent ignores workspace's database schema location";

evalTest('USUALLY_PASSES', {
name: ignoringDbSchemaLocation,
prompt: `The database schema for this workspace is located in \`db/schema.sql\`.`,
Expand All @@ -143,6 +165,7 @@ describe('save_memory', () => {
const wasToolCalled = rig
.readToolLogs()
.some((log) => log.toolRequest.name === 'save_memory');

expect(
wasToolCalled,
'save_memory should not be called for workspace-specific information',
Expand All @@ -154,6 +177,7 @@ describe('save_memory', () => {

const rememberingCodingStyle =
"Agent remembers user's coding style preference";

evalTest('ALWAYS_PASSES', {
name: rememberingCodingStyle,

Expand All @@ -174,6 +198,7 @@ describe('save_memory', () => {

const ignoringBuildArtifactLocation =
'Agent ignores workspace build artifact location';

evalTest('USUALLY_PASSES', {
name: ignoringBuildArtifactLocation,
prompt: `In this workspace, build artifacts are stored in the \`dist/artifacts\` directory.`,
Expand All @@ -182,6 +207,7 @@ describe('save_memory', () => {
const wasToolCalled = rig
.readToolLogs()
.some((log) => log.toolRequest.name === 'save_memory');

expect(
wasToolCalled,
'save_memory should not be called for workspace-specific information',
Expand All @@ -191,7 +217,9 @@ describe('save_memory', () => {
},
});

const ignoringMainEntryPoint = "Agent ignores workspace's main entry point";
const ignoringMainEntryPoint =
"Agent ignores workspace's main entry point";

evalTest('USUALLY_PASSES', {
name: ignoringMainEntryPoint,
prompt: `The main entry point for this workspace is \`src/index.js\`.`,
Expand All @@ -200,6 +228,7 @@ describe('save_memory', () => {
const wasToolCalled = rig
.readToolLogs()
.some((log) => log.toolRequest.name === 'save_memory');

expect(
wasToolCalled,
'save_memory should not be called for workspace-specific information',
Expand All @@ -210,6 +239,7 @@ describe('save_memory', () => {
});

const rememberingBirthday = "Agent remembers user's birthday";

evalTest('ALWAYS_PASSES', {
name: rememberingBirthday,

Expand Down
Loading