diff --git a/evals/app-test-helper.ts b/evals/app-test-helper.ts index 2bcff419242..b1f8aa6e7a9 100644 --- a/evals/app-test-helper.ts +++ b/evals/app-test-helper.ts @@ -20,16 +20,44 @@ import { DEFAULT_GEMINI_MODEL } from '@google/gemini-cli-core'; * forbidden. Evals must test against the full, default tool set to ensure * realistic behavior. */ -interface EvalConfigOverrides { - /** Restricting tools via excludeTools in evals is forbidden. */ - excludeTools?: never; - /** Restricting tools via coreTools in evals is forbidden. */ - coreTools?: never; - /** Restricting tools via allowedTools in evals is forbidden. */ - allowedTools?: never; - /** Restricting tools via mainAgentTools in evals is forbidden. */ - mainAgentTools?: never; +type ForbiddenToolKeys = + | 'excludeTools' + | 'coreTools' + | 'allowedTools' + | 'mainAgentTools'; + +type NoToolRestrictions = T & { + [K in ForbiddenToolKeys]?: never; +}; + +export type EvalConfigOverrides = NoToolRestrictions<{ [key: string]: unknown; +}>; + +/** + * Runtime safety: remove forbidden tool restriction keys if present. + */ +function sanitizeConfigOverrides( + overrides?: Record +): Record | undefined { + if (!overrides) return overrides; + + const forbidden: ForbiddenToolKeys[] = [ + 'excludeTools', + 'coreTools', + 'allowedTools', + 'mainAgentTools', + ]; + + const sanitized = { ...overrides }; + + for (const key of forbidden) { + if (key in sanitized) { + delete sanitized[key]; + } + } + + return sanitized; } export interface AppEvalCase { @@ -51,7 +79,7 @@ export function appEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) { const rig = new AppRig({ configOverrides: { model: DEFAULT_GEMINI_MODEL, - ...evalCase.configOverrides, + ...sanitizeConfigOverrides(evalCase.configOverrides), }, }); diff --git a/evals/cli_help_delegation.eval.ts b/evals/cli_help_delegation.eval.ts index 8be3bf1c51c..09b57e0c3e5 100644 --- a/evals/cli_help_delegation.eval.ts +++ b/evals/cli_help_delegation.eval.ts @@ -11,7 +11,8 @@ describe('CliHelpAgent Delegation', () => { }, }, }, - prompt: 'Help me create a subagent in this project', + // Refined prompt for clearer delegation intent + prompt: 'How do I create and configure a subagent using the CLI in this project?', timeout: 60000, assert: async (rig, _result) => { const toolLogs = rig.readToolLogs(); diff --git a/evals/generalist_delegation.eval.ts b/evals/generalist_delegation.eval.ts index 81252880eba..b9d1b8a9403 100644 --- a/evals/generalist_delegation.eval.ts +++ b/evals/generalist_delegation.eval.ts @@ -7,6 +7,19 @@ import { describe, expect } from 'vitest'; import { appEvalTest } from './app-test-helper.js'; +/** + * Compile-time guard to prevent tool restriction usage in eval configs + */ +type ForbiddenToolKeys = + | 'excludeTools' + | 'coreTools' + | 'allowedTools' + | 'mainAgentTools'; + +type NoToolRestrictions = T & { + [K in ForbiddenToolKeys]?: never; +}; + describe('generalist_delegation', () => { // --- Positive Evals (Should Delegate) --- @@ -21,7 +34,7 @@ describe('generalist_delegation', () => { experimental: { enableAgents: true, }, - }, + } as NoToolRestrictions, files: { 'file1.ts': 'console.log("no semi")', 'file2.ts': 'console.log("no semi")', @@ -64,7 +77,7 @@ describe('generalist_delegation', () => { experimental: { enableAgents: true, }, - }, + } as NoToolRestrictions, files: { 'src/a.ts': 'export const a = 1;', 'src/b.ts': 'export const b = 2;', @@ -104,7 +117,7 @@ describe('generalist_delegation', () => { experimental: { enableAgents: true, }, - }, + } as NoToolRestrictions, files: { 'README.md': 'This is a proyect.', }, @@ -138,7 +151,7 @@ describe('generalist_delegation', () => { experimental: { enableAgents: true, }, - }, + } as NoToolRestrictions, files: { 'src/VERSION': '1.2.3', }, diff --git a/evals/grep_search_functionality.eval.ts b/evals/grep_search_functionality.eval.ts index f1224b8221f..33abdb017b9 100644 --- a/evals/grep_search_functionality.eval.ts +++ b/evals/grep_search_functionality.eval.ts @@ -11,6 +11,20 @@ import { checkModelOutputContent, } from './test-helper.js'; +function parseToolArgs(args: unknown): Record | null { + if (args && typeof args === 'object') { + return args as Record; + } + if (typeof args === 'string') { + try { + return JSON.parse(args) as Record; + } catch { + return null; + } + } + return null; +} + describe('grep_search_functionality', () => { const TEST_PREFIX = 'Grep Search Functionality: '; @@ -44,8 +58,8 @@ describe('grep_search_functionality', () => { 'grep_search', undefined, (args) => { - const params = JSON.parse(args); - return params.case_sensitive === true; + const params = parseToolArgs(args); + return params?.case_sensitive === true; }, ); expect( @@ -74,8 +88,8 @@ describe('grep_search_functionality', () => { 'grep_search', undefined, (args) => { - const params = JSON.parse(args); - return params.names_only === true; + const params = parseToolArgs(args); + return params?.names_only === true; }, ); expect( @@ -104,8 +118,8 @@ describe('grep_search_functionality', () => { 'grep_search', undefined, (args) => { - const params = JSON.parse(args); - return params.include_pattern === '*.js'; + const params = parseToolArgs(args); + return params?.include_pattern === '*.js'; }, ); expect( @@ -134,8 +148,8 @@ describe('grep_search_functionality', () => { 'grep_search', undefined, (args) => { - const params = JSON.parse(args); - return params.dir_path === 'src'; + const params = parseToolArgs(args); + return params?.dir_path === 'src'; }, ); expect( diff --git a/evals/model_steering.eval.ts b/evals/model_steering.eval.ts index 2cb87edcc22..550d294d507 100644 --- a/evals/model_steering.eval.ts +++ b/evals/model_steering.eval.ts @@ -11,12 +11,25 @@ import fs from 'node:fs'; import { appEvalTest } from './app-test-helper.js'; import { PolicyDecision } from '@google/gemini-cli-core'; +/** + * Compile-time guard to prevent tool restriction usage in eval configs + */ +type ForbiddenToolKeys = + | 'excludeTools' + | 'coreTools' + | 'allowedTools' + | 'mainAgentTools'; + +type NoToolRestrictions = T & { + [K in ForbiddenToolKeys]?: never; +}; + describe('Model Steering Behavioral Evals', () => { appEvalTest('USUALLY_PASSES', { name: 'Corrective Hint: Model switches task based on hint during tool turn', configOverrides: { modelSteering: true, - }, + } as NoToolRestrictions>, files: { 'README.md': '# Gemini CLI\nThis is a tool for developers.\nLicense: Apache-2.0\nLine 4\nLine 5\nLine 6', @@ -55,7 +68,7 @@ describe('Model Steering Behavioral Evals', () => { name: 'Suggestive Hint: Model incorporates user guidance mid-stream', configOverrides: { modelSteering: true, - }, + } as NoToolRestrictions>, files: {}, prompt: 'Create a file called "hw.js" with a JS hello world.', setup: async (rig) => { diff --git a/evals/save_memory.eval.ts b/evals/save_memory.eval.ts index 25e081a819f..51f47887176 100644 --- a/evals/save_memory.eval.ts +++ b/evals/save_memory.eval.ts @@ -11,9 +11,23 @@ import { checkModelOutputContent, } from '../integration-tests/test-helper.js'; +/** + * Compile-time guard to prevent tool restriction usage in eval configs + */ +type ForbiddenToolKeys = + | 'excludeTools' + | 'coreTools' + | 'allowedTools' + | 'mainAgentTools'; + +type NoToolRestrictions = T & { + [K in ForbiddenToolKeys]?: never; +}; + describe('save_memory', () => { const TEST_PREFIX = 'Save memory test: '; const rememberingFavoriteColor = "Agent remembers user's favorite color"; + evalTest('ALWAYS_PASSES', { name: rememberingFavoriteColor, @@ -33,7 +47,9 @@ describe('save_memory', () => { }); }, }); + const rememberingCommandRestrictions = 'Agent remembers command restrictions'; + evalTest('USUALLY_PASSES', { name: rememberingCommandRestrictions, @@ -53,6 +69,7 @@ describe('save_memory', () => { }); const rememberingWorkflow = 'Agent remembers workflow preferences'; + evalTest('USUALLY_PASSES', { name: rememberingWorkflow, @@ -73,6 +90,7 @@ describe('save_memory', () => { const ignoringTemporaryInformation = 'Agent ignores temporary conversation details'; + evalTest('ALWAYS_PASSES', { name: ignoringTemporaryInformation, @@ -82,6 +100,7 @@ describe('save_memory', () => { const wasToolCalled = rig .readToolLogs() .some((log) => log.toolRequest.name === 'save_memory'); + expect( wasToolCalled, 'save_memory should not be called for temporary information', @@ -96,6 +115,7 @@ describe('save_memory', () => { }); const rememberingPetName = "Agent remembers user's pet's name"; + evalTest('ALWAYS_PASSES', { name: rememberingPetName, @@ -115,6 +135,7 @@ describe('save_memory', () => { }); const rememberingCommandAlias = 'Agent remembers custom command aliases'; + evalTest('ALWAYS_PASSES', { name: rememberingCommandAlias, @@ -135,6 +156,7 @@ describe('save_memory', () => { const ignoringDbSchemaLocation = "Agent ignores workspace's database schema location"; + evalTest('USUALLY_PASSES', { name: ignoringDbSchemaLocation, prompt: `The database schema for this workspace is located in \`db/schema.sql\`.`, @@ -143,6 +165,7 @@ describe('save_memory', () => { const wasToolCalled = rig .readToolLogs() .some((log) => log.toolRequest.name === 'save_memory'); + expect( wasToolCalled, 'save_memory should not be called for workspace-specific information', @@ -154,6 +177,7 @@ describe('save_memory', () => { const rememberingCodingStyle = "Agent remembers user's coding style preference"; + evalTest('ALWAYS_PASSES', { name: rememberingCodingStyle, @@ -174,6 +198,7 @@ describe('save_memory', () => { const ignoringBuildArtifactLocation = 'Agent ignores workspace build artifact location'; + evalTest('USUALLY_PASSES', { name: ignoringBuildArtifactLocation, prompt: `In this workspace, build artifacts are stored in the \`dist/artifacts\` directory.`, @@ -182,6 +207,7 @@ describe('save_memory', () => { const wasToolCalled = rig .readToolLogs() .some((log) => log.toolRequest.name === 'save_memory'); + expect( wasToolCalled, 'save_memory should not be called for workspace-specific information', @@ -191,7 +217,9 @@ describe('save_memory', () => { }, }); - const ignoringMainEntryPoint = "Agent ignores workspace's main entry point"; + const ignoringMainEntryPoint = + "Agent ignores workspace's main entry point"; + evalTest('USUALLY_PASSES', { name: ignoringMainEntryPoint, prompt: `The main entry point for this workspace is \`src/index.js\`.`, @@ -200,6 +228,7 @@ describe('save_memory', () => { const wasToolCalled = rig .readToolLogs() .some((log) => log.toolRequest.name === 'save_memory'); + expect( wasToolCalled, 'save_memory should not be called for workspace-specific information', @@ -210,6 +239,7 @@ describe('save_memory', () => { }); const rememberingBirthday = "Agent remembers user's birthday"; + evalTest('ALWAYS_PASSES', { name: rememberingBirthday, diff --git a/evals/validation_fidelity.eval.ts b/evals/validation_fidelity.eval.ts index 8cfb4f6626e..1dcda7a49e2 100644 --- a/evals/validation_fidelity.eval.ts +++ b/evals/validation_fidelity.eval.ts @@ -7,6 +7,20 @@ import { describe, expect } from 'vitest'; import { evalTest } from './test-helper.js'; +function parseToolArgs(args: unknown): Record | null { + if (args && typeof args === 'object') { + return args as Record; + } + if (typeof args === 'string') { + try { + return JSON.parse(args) as Record; + } catch { + return null; + } + } + return null; +} + describe('validation_fidelity', () => { evalTest('USUALLY_PASSES', { name: 'should perform exhaustive validation autonomously when guided by system instructions', @@ -67,7 +81,9 @@ test('formats log correctly', () => { ); const hasBuildOrTsc = shellCalls.some((log) => { - const cmd = JSON.parse(log.toolRequest.args).command.toLowerCase(); + const args = parseToolArgs(log.toolRequest.args); + const cmd = + typeof args?.command === 'string' ? args.command.toLowerCase() : ''; return ( cmd.includes('npm run build') || cmd.includes('tsc') || diff --git a/evals/validation_fidelity_pre_existing_errors.eval.ts b/evals/validation_fidelity_pre_existing_errors.eval.ts index 4990b7bc918..35e69e40bf5 100644 --- a/evals/validation_fidelity_pre_existing_errors.eval.ts +++ b/evals/validation_fidelity_pre_existing_errors.eval.ts @@ -7,6 +7,20 @@ import { describe, expect } from 'vitest'; import { evalTest } from './test-helper.js'; +function parseToolArgs(args: unknown): Record | null { + if (args && typeof args === 'object') { + return args as Record; + } + if (typeof args === 'string') { + try { + return JSON.parse(args) as Record; + } catch { + return null; + } + } + return null; +} + describe('validation_fidelity_pre_existing_errors', () => { evalTest('USUALLY_PASSES', { name: 'should handle pre-existing project errors gracefully during validation', @@ -55,9 +69,14 @@ export function multiply(a: number, b: number): number { // Verify it did the work in math.ts const mathRefactor = replaceCalls.some((log) => { - const args = JSON.parse(log.toolRequest.args); + const args = parseToolArgs(log.toolRequest.args); + if (!args) { + return false; + } return ( + typeof args.file_path === 'string' && args.file_path.endsWith('src/math.ts') && + typeof args.new_string === 'string' && args.new_string.includes('sum') ); }); @@ -67,7 +86,9 @@ export function multiply(a: number, b: number): number { (log) => log.toolRequest.name === 'run_shell_command', ); const ranValidation = shellCalls.some((log) => { - const cmd = JSON.parse(log.toolRequest.args).command.toLowerCase(); + const args = parseToolArgs(log.toolRequest.args); + const cmd = + typeof args?.command === 'string' ? args.command.toLowerCase() : ''; return cmd.includes('build') || cmd.includes('tsc'); });