Skip to content
223 changes: 216 additions & 7 deletions evals/save_memory.eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,234 @@

import { describe, expect } from 'vitest';
import { evalTest } from './test-helper.js';
import { validateModelOutput } from '../integration-tests/test-helper.js';
import {
assertModelHasOutput,
checkModelOutputContent,
} from '../integration-tests/test-helper.js';

describe('save_memory', () => {
// Prefix shared by the `testName` labels passed to checkModelOutputContent in this suite.
const TEST_PREFIX = 'Save memory test: ';

// Case: the user states a personal fact and immediately asks for it back.
// Expects a save_memory tool call and a reply that mentions "blue".
const rememberingFavoriteColor = "Agent remembers user's favorite color";
evalTest('ALWAYS_PASSES', {
  // Single `name` key: the earlier hard-coded 'should be able to save to memory'
  // entry was a duplicate property that this one overrode anyway.
  name: rememberingFavoriteColor,
  params: {
    settings: { tools: { core: ['save_memory'] } },
  },
  prompt: `remember that my favorite color is blue.

what is my favorite color? tell me that and surround it with $ symbol`,
  assert: async (rig, result) => {
    // One wait is enough; the previous unused `foundToolCall` repeated this call.
    const wasToolCalled = await rig.waitForToolCall('save_memory');
    expect(wasToolCalled, 'Expected save_memory tool to be called').toBe(
      true,
    );

    assertModelHasOutput(result);
    checkModelOutputContent(result, {
      expectedContent: 'blue',
      testName: `${TEST_PREFIX}${rememberingFavoriteColor}`,
    });
  },
});
// Case: the user bans a family of commands. Expects a save_memory tool call
// and a reply matching one of the acknowledgement patterns below.
const rememberingCommandRestrictions = 'Agent remembers command restrictions';
evalTest('ALWAYS_PASSES', {
  name: rememberingCommandRestrictions,
  params: { settings: { tools: { core: ['save_memory'] } } },
  prompt: `I don't want you to ever run npm commands.`,
  assert: async (rig, result) => {
    const called = await rig.waitForToolCall('save_memory');
    expect(called, 'Expected save_memory tool to be called').toBe(true);

    assertModelHasOutput(result);

    const acknowledgement = [/not run npm commands|remember|ok/i];
    const label = `${TEST_PREFIX}${rememberingCommandRestrictions}`;
    checkModelOutputContent(result, {
      expectedContent: acknowledgement,
      testName: label,
    });
  },
});

// Case: the user states a standing workflow rule (lint after every build).
// Expects a save_memory tool call and an acknowledging reply.
const rememberingWorkflow = 'Agent remembers workflow preferences';
evalTest('ALWAYS_PASSES', {
  name: rememberingWorkflow,
  params: { settings: { tools: { core: ['save_memory'] } } },
  prompt: `I want you to always lint after building.`,
  assert: async (rig, result) => {
    const called = await rig.waitForToolCall('save_memory');
    expect(called, 'Expected save_memory tool to be called').toBe(true);

    assertModelHasOutput(result);

    const acknowledgement = [/always|ok|remember|will do/i];
    const label = `${TEST_PREFIX}${rememberingWorkflow}`;
    checkModelOutputContent(result, {
      expectedContent: acknowledgement,
      testName: label,
    });
  },
});

// Negative case: a throwaway remark must NOT be persisted. Expects no
// save_memory entry in the tool logs and a reply free of the phrases the
// other cases accept as a "saved" acknowledgement.
const ignoringTemporaryInformation =
  'Agent ignores temporary conversation details';
evalTest('ALWAYS_PASSES', {
  name: ignoringTemporaryInformation,
  params: {
    settings: { tools: { core: ['save_memory'] } },
  },
  prompt: `I'm going to get a coffee.`,
  assert: async (rig, result) => {
    // No tool call is expected, so rather than waiting for one we scan the
    // recorded tool logs (presumably complete once telemetry is ready).
    await rig.waitForTelemetryReady();
    const wasToolCalled = rig
      .readToolLogs()
      .some((log) => log.toolRequest.name === 'save_memory');
    // A single assertion: the stale `foundToolCall ... toBeTruthy()` arguments
    // that were spliced into this call are removed.
    expect(
      wasToolCalled,
      'save_memory should not be called for temporary information',
    ).toBe(false);

    assertModelHasOutput(result);
    checkModelOutputContent(result, {
      testName: `${TEST_PREFIX}${ignoringTemporaryInformation}`,
      forbiddenContent: [/remember|will do/i],
    });
  },
});

// Case: the user shares a pet's name and asks for it back in the same turn.
// Expects a save_memory tool call and a reply that mentions "Buddy".
const rememberingPetName = "Agent remembers user's pet's name";
evalTest('ALWAYS_PASSES', {
  name: rememberingPetName,
  params: { settings: { tools: { core: ['save_memory'] } } },
  prompt: `My dog's name is Buddy. What is my dog's name?`,
  assert: async (rig, result) => {
    const called = await rig.waitForToolCall('save_memory');
    expect(called, 'Expected save_memory tool to be called').toBe(true);

    assertModelHasOutput(result);

    const petNamePattern = [/Buddy/i];
    const label = `${TEST_PREFIX}${rememberingPetName}`;
    checkModelOutputContent(result, {
      expectedContent: petNamePattern,
      testName: label,
    });
  },
});

// Case: the user defines a phrase-to-command alias. Expects a save_memory
// tool call and a reply that echoes the alias or acknowledges it.
const rememberingCommandAlias = 'Agent remembers custom command aliases';
evalTest('ALWAYS_PASSES', {
  name: rememberingCommandAlias,
  params: { settings: { tools: { core: ['save_memory'] } } },
  prompt: `When I say 'start server', you should run 'npm run dev'.`,
  assert: async (rig, result) => {
    const called = await rig.waitForToolCall('save_memory');
    expect(called, 'Expected save_memory tool to be called').toBe(true);

    assertModelHasOutput(result);

    const acknowledgement = [/npm run dev|start server|ok|remember|will do/i];
    const label = `${TEST_PREFIX}${rememberingCommandAlias}`;
    checkModelOutputContent(result, {
      expectedContent: acknowledgement,
      testName: label,
    });
  },
});

// Case: the user states where the project's database schema lives.
// Expects a save_memory tool call and an acknowledging reply.
const rememberingDbSchemaLocation =
  "Agent remembers project's database schema location";
evalTest('ALWAYS_PASSES', {
  name: rememberingDbSchemaLocation,
  params: { settings: { tools: { core: ['save_memory'] } } },
  prompt: `The database schema for this project is located in \`db/schema.sql\`.`,
  assert: async (rig, result) => {
    const called = await rig.waitForToolCall('save_memory');
    expect(called, 'Expected save_memory tool to be called').toBe(true);

    assertModelHasOutput(result);

    const acknowledgement = [/database schema|ok|remember|will do/i];
    const label = `${TEST_PREFIX}${rememberingDbSchemaLocation}`;
    checkModelOutputContent(result, {
      expectedContent: acknowledgement,
      testName: label,
    });
  },
});

// Case: the user states an indentation preference (tabs over spaces).
// Expects a save_memory tool call and an acknowledging reply.
const rememberingCodingStyle =
  "Agent remembers user's coding style preference";
evalTest('ALWAYS_PASSES', {
  name: rememberingCodingStyle,
  params: { settings: { tools: { core: ['save_memory'] } } },
  prompt: `I prefer to use tabs instead of spaces for indentation.`,
  assert: async (rig, result) => {
    const called = await rig.waitForToolCall('save_memory');
    expect(called, 'Expected save_memory tool to be called').toBe(true);

    assertModelHasOutput(result);

    const acknowledgement = [/tabs instead of spaces|ok|remember|will do/i];
    const label = `${TEST_PREFIX}${rememberingCodingStyle}`;
    checkModelOutputContent(result, {
      expectedContent: acknowledgement,
      testName: label,
    });
  },
});

// Case: the user states the command that runs the backend test suite.
// Expects a save_memory tool call and an acknowledging reply.
const rememberingTestCommand =
  'Agent remembers specific project test command';
evalTest('ALWAYS_PASSES', {
  name: rememberingTestCommand,
  params: { settings: { tools: { core: ['save_memory'] } } },
  prompt: `The command to run all backend tests is \`npm run test:backend\`.`,
  assert: async (rig, result) => {
    const called = await rig.waitForToolCall('save_memory');
    expect(called, 'Expected save_memory tool to be called').toBe(true);

    assertModelHasOutput(result);

    const acknowledgement = [
      /command to run all backend tests|ok|remember|will do/i,
    ];
    const label = `${TEST_PREFIX}${rememberingTestCommand}`;
    checkModelOutputContent(result, {
      expectedContent: acknowledgement,
      testName: label,
    });
  },
});

// Case: the user states the project's main entry point.
// Expects a save_memory tool call and an acknowledging reply.
const rememberingMainEntryPoint =
  "Agent remembers project's main entry point";
evalTest('ALWAYS_PASSES', {
  name: rememberingMainEntryPoint,
  params: {
    settings: { tools: { core: ['save_memory'] } },
  },
  prompt: `The main entry point for this project is \`src/index.js\`.`,
  assert: async (rig, result) => {
    const wasToolCalled = await rig.waitForToolCall('save_memory');
    expect(wasToolCalled, 'Expected save_memory tool to be called').toBe(
      true,
    );

    // The reply is checked only against entry-point wording; the leftover
    // validateModelOutput(result, 'blue', ...) call from the favourite-colour
    // test did not belong here and has been removed.
    assertModelHasOutput(result);
    checkModelOutputContent(result, {
      expectedContent: [
        /main entry point for this project|ok|remember|will do/i,
      ],
      testName: `${TEST_PREFIX}${rememberingMainEntryPoint}`,
    });
  },
});
});
18 changes: 13 additions & 5 deletions integration-tests/file-system.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,12 @@
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import { existsSync } from 'node:fs';
import * as path from 'node:path';
import { TestRig, printDebugInfo, validateModelOutput } from './test-helper.js';
import {
TestRig,
printDebugInfo,
assertModelHasOutput,
checkModelOutputContent,
} from './test-helper.js';

describe('file-system', () => {
let rig: TestRig;
Expand Down Expand Up @@ -43,8 +48,11 @@ describe('file-system', () => {
'Expected to find a read_file tool call',
).toBeTruthy();

// Validate model output - will throw if no output, warn if missing expected content
validateModelOutput(result, 'hello world', 'File read test');
assertModelHasOutput(result);
checkModelOutputContent(result, {
expectedContent: 'hello world',
testName: 'File read test',
});
});

it('should be able to write a file', async () => {
Expand Down Expand Up @@ -74,8 +82,8 @@ describe('file-system', () => {
'Expected to find a write_file, edit, or replace tool call',
).toBeTruthy();

// Validate model output - will throw if no output
validateModelOutput(result, null, 'File write test');
assertModelHasOutput(result);
checkModelOutputContent(result, { testName: 'File write test' });

const fileContent = rig.readFile('test.txt');

Expand Down
18 changes: 11 additions & 7 deletions integration-tests/google_web_search.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,12 @@

import { WEB_SEARCH_TOOL_NAME } from '../packages/core/src/tools/tool-names.js';
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import { TestRig, printDebugInfo, validateModelOutput } from './test-helper.js';
import {
TestRig,
printDebugInfo,
assertModelHasOutput,
checkModelOutputContent,
} from './test-helper.js';

describe('web search tool', () => {
let rig: TestRig;
Expand Down Expand Up @@ -68,12 +73,11 @@ describe('web search tool', () => {
`Expected to find a call to ${WEB_SEARCH_TOOL_NAME}`,
).toBeTruthy();

// Validate model output - will throw if no output, warn if missing expected content
const hasExpectedContent = validateModelOutput(
result,
['weather', 'london'],
'Google web search test',
);
assertModelHasOutput(result);
const hasExpectedContent = checkModelOutputContent(result, {
expectedContent: ['weather', 'london'],
testName: 'Google web search test',
});

// If content was missing, log the search queries used
if (!hasExpectedContent) {
Expand Down
10 changes: 7 additions & 3 deletions integration-tests/list_directory.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ import {
TestRig,
poll,
printDebugInfo,
validateModelOutput,
assertModelHasOutput,
checkModelOutputContent,
} from './test-helper.js';
import { existsSync } from 'node:fs';
import { join } from 'node:path';
Expand Down Expand Up @@ -68,7 +69,10 @@ describe('list_directory', () => {
throw e;
}

// Validate model output - will throw if no output, warn if missing expected content
validateModelOutput(result, ['file1.txt', 'subdir'], 'List directory test');
assertModelHasOutput(result);
checkModelOutputContent(result, {
expectedContent: ['file1.txt', 'subdir'],
testName: 'List directory test',
});
});
});
11 changes: 8 additions & 3 deletions integration-tests/read_many_files.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@
*/

import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import { TestRig, printDebugInfo, validateModelOutput } from './test-helper.js';
import {
TestRig,
printDebugInfo,
assertModelHasOutput,
checkModelOutputContent,
} from './test-helper.js';

describe('read_many_files', () => {
let rig: TestRig;
Expand Down Expand Up @@ -50,7 +55,7 @@ describe('read_many_files', () => {
'Expected to find either read_many_files or multiple read_file tool calls',
).toBeTruthy();

// Validate model output - will throw if no output
validateModelOutput(result, null, 'Read many files test');
assertModelHasOutput(result);
checkModelOutputContent(result, { testName: 'Read many files test' });
});
});
Loading
Loading