-
Notifications
You must be signed in to change notification settings - Fork 13k
Add behavioral evals for tracker #20069
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
30 commits
Select commit
Hold shift + click to select a range
b5db5d3
feat(core): implement task tracker foundation and service (Phase 1)
anj-s f737126
feat(core,cli): implement task tracker tools and feature flag (Phase 2)
anj-s 448af0a
chore(core): improve ID generation and add runtime task validation
anj-s 2d6ee8f
fix: address code review comments from bot in trackerService.ts
anj-s 9618f8f
Merge branch 'u/anj/task-tracker-phase-1' into u/anj/task-tracker-pha…
anj-s 6a24077
docs: update implementation plan for Phase 2
anj-s ae96477
feat(tracker): move tracker storage to project temp directory
anj-s 91e7881
Merge branch 'u/anj/task-tracker-phase-1' into u/anj/task-tracker-pha…
anj-s e6b68e7
feat(tracker): integrate dynamic storage path in Config and tools
anj-s ba73124
feat(tracker): simplify tracker storage path
anj-s e32d8a2
Merge branch 'u/anj/task-tracker-phase-1' into u/anj/task-tracker-pha…
anj-s 1e743ea
feat(tracker): update config to use simplified tracker path
anj-s 23194dd
feat(tracker): restore session-specific nested storage path
anj-s de0ce2c
Merge branch 'u/anj/task-tracker-phase-1' into u/anj/task-tracker-pha…
anj-s eed1ca6
feat(tracker): restore nested tracker path in Config
anj-s eb33db5
feat(tracker): simplify tracker storage path and flatten directory st…
anj-s daf9da2
fix(tracker): lazily initialize tracker directory
anj-s 41cf395
chore(tracker): remove plans configuration directory from git tracking
anj-s fc298e3
remove .gitignore changes
anj-s 30fe8ff
remove .gitignore changes
anj-s 5d31761
si changes: task tracker prep implementation
anj-s 53755c5
si changes
anj-s b53aa80
test: add explicit and implicit behavioral evals for tracker
anj-s 1a345a2
behavioral evals for tracker
anj-s 441ab08
update behavioral evals
anj-s c03d3db
merge
anj-s 8ac7fac
Merge branch 'main' into anj/tracker-evals
anj-s f50b80b
address review comments
anj-s b10223b
addressed comments
anj-s b5b62f5
Merge branch 'main' into anj/tracker-evals
anj-s File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,116 @@ | ||
| /** | ||
| * @license | ||
| * Copyright 2026 Google LLC | ||
| * SPDX-License-Identifier: Apache-2.0 | ||
| */ | ||
|
|
||
| import { describe, expect } from 'vitest'; | ||
| import { | ||
| TRACKER_CREATE_TASK_TOOL_NAME, | ||
| TRACKER_UPDATE_TASK_TOOL_NAME, | ||
| } from '@google/gemini-cli-core'; | ||
| import { evalTest, assertModelHasOutput } from './test-helper.js'; | ||
| import fs from 'node:fs'; | ||
| import path from 'node:path'; | ||
|
|
||
| // Manifest for the throwaway project; its test script only echoes a success line. | ||
| const PACKAGE_MANIFEST = { | ||
| name: 'test-project', | ||
| version: '1.0.0', | ||
| scripts: { test: 'echo "All tests passed!"' }, | ||
| }; | ||
|
|
||
||
| // Login module seeded with a marked bug: the password argument is never checked. | ||
| const LOGIN_SOURCE = | ||
| 'function login(username, password) {\n if (!username) throw new Error("Missing username");\n // BUG: missing password check\n return true;\n}'; | ||
|
|
||
||
| // Workspace files handed to every eval in this suite, keyed by relative path. | ||
| const FILES = { | ||
| 'package.json': JSON.stringify(PACKAGE_MANIFEST), | ||
| 'src/login.js': LOGIN_SOURCE, | ||
| } as const; | ||
|
|
||
| describe('tracker_mode', () => { | ||
| // Explicit flow: the prompt asks the model to create a tracker task, fix the | ||
| // bug in src/login.js, and then mark the task as closed. | ||
| evalTest('USUALLY_PASSES', { | ||
| name: 'should manage tasks in the tracker when explicitly requested during a bug fix', | ||
| params: { | ||
| settings: { experimental: { taskTracker: true } }, | ||
| }, | ||
| files: FILES, | ||
| prompt: | ||
| 'We have a bug in src/login.js: the password check is missing. First, create a task in the tracker to fix it. Then fix the bug, and mark the task as closed.', | ||
| assert: async (rig, result) => { | ||
| const wasCreateCalled = await rig.waitForToolCall( | ||
| TRACKER_CREATE_TASK_TOOL_NAME, | ||
| ); | ||
| expect( | ||
| wasCreateCalled, | ||
| 'Expected tracker_create_task tool to be called', | ||
| ).toBe(true); | ||
|
|
||
||
| // The created task must mention the login bug in its title or description. | ||
| const toolLogs = rig.readToolLogs(); | ||
| const createCall = toolLogs.find( | ||
| (log) => log.toolRequest.name === TRACKER_CREATE_TASK_TOOL_NAME, | ||
| ); | ||
| expect(createCall).toBeDefined(); | ||
| const args = JSON.parse(createCall!.toolRequest.args); | ||
| expect( | ||
| (args.title?.toLowerCase() ?? '') + | ||
| (args.description?.toLowerCase() ?? ''), | ||
| ).toContain('login'); | ||
|
|
||
||
| const wasUpdateCalled = await rig.waitForToolCall( | ||
| TRACKER_UPDATE_TASK_TOOL_NAME, | ||
| ); | ||
| expect( | ||
| wasUpdateCalled, | ||
| 'Expected tracker_update_task tool to be called', | ||
| ).toBe(true); | ||
|
|
||
||
| // Read the logs again after waiting for the update call, so this lookup | ||
| // does not depend on the snapshot taken before that wait. | ||
| const updateCalls = rig | ||
| .readToolLogs() | ||
| .filter((log) => log.toolRequest.name === TRACKER_UPDATE_TASK_TOOL_NAME); | ||
| expect(updateCalls.length).toBeGreaterThan(0); | ||
|
|
||
||
| // The update tool may be called more than once, so accept any call that | ||
| // closes the task instead of inspecting only the first one. | ||
| const wasTaskClosed = updateCalls.some( | ||
| (log) => JSON.parse(log.toolRequest.args).status === 'closed', | ||
| ); | ||
| expect( | ||
| wasTaskClosed, | ||
| 'Expected tracker_update_task to set status to closed', | ||
| ).toBe(true); | ||
|
|
||
||
| // The fix is detected by the seeded bug marker no longer being in the file. | ||
| const loginContent = fs.readFileSync( | ||
| path.join(rig.testDir!, 'src/login.js'), | ||
| 'utf-8', | ||
| ); | ||
| expect(loginContent).not.toContain('// BUG: missing password check'); | ||
|
|
||
||
| assertModelHasOutput(result); | ||
| }, | ||
| }); | ||
|
|
||
| // Implicit flow: the prompt asks only for a plan split into small chunks and | ||
| // never names the tracker tools. | ||
| evalTest('USUALLY_PASSES', { | ||
| name: 'should implicitly create tasks when asked to build a feature plan', | ||
| params: { | ||
| settings: { experimental: { taskTracker: true } }, | ||
| }, | ||
| files: FILES, | ||
| prompt: | ||
anj-s marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| 'I need to build a complex new feature for user authentication in our project. Create a detailed implementation plan and organize the work into bite-sized chunks. Do not actually implement the code yet, just plan it.', | ||
| assert: async (rig, result) => { | ||
| // Planning work should lead the model to call tracker_create_task unprompted. | ||
| const createToolInvoked = await rig.waitForToolCall( | ||
| TRACKER_CREATE_TASK_TOOL_NAME, | ||
| ); | ||
| expect( | ||
| createToolInvoked, | ||
| 'Expected tracker_create_task to be called implicitly to organize plan', | ||
| ).toBe(true); | ||
|
|
||
||
| // One created task is the minimum; a multi-step plan will likely yield more. | ||
| const createdTasks = rig | ||
| .readToolLogs() | ||
| .filter( | ||
| ({ toolRequest }) => toolRequest.name === TRACKER_CREATE_TASK_TOOL_NAME, | ||
| ); | ||
| expect(createdTasks.length).toBeGreaterThan(0); | ||
|
|
||
||
| // Plan-only request: the seeded bug marker must still be in src/login.js. | ||
| const loginPath = path.join(rig.testDir!, 'src/login.js'); | ||
| const loginSource = fs.readFileSync(loginPath, 'utf-8'); | ||
| expect(loginSource).toContain('// BUG: missing password check'); | ||
|
|
||
||
| assertModelHasOutput(result); | ||
| }, | ||
| }); | ||
| }); | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.