Skip to content

Commit d9d2ce3

Browse files
authored
test(evals): add comprehensive subagent delegation evaluations (#24132)
1 parent da8c841 commit d9d2ce3

File tree

3 files changed

+202
-19
lines changed

3 files changed

+202
-19
lines changed

evals/subagents.eval.ts

Lines changed: 117 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,21 @@ import { evalTest, TEST_AGENTS } from './test-helper.js';
1313

1414
const INDEX_TS = 'export const add = (a: number, b: number) => a + b;\n';
1515

16+
// A minimal package.json is used to provide a realistic workspace anchor.
17+
// This prevents the agent from making incorrect assumptions about the environment
18+
// and helps it properly navigate or act as if it is in a standard Node.js project.
19+
const MOCK_PACKAGE_JSON = JSON.stringify(
20+
{
21+
name: 'subagent-eval-project',
22+
version: '1.0.0',
23+
type: 'module',
24+
},
25+
null,
26+
2,
27+
);
28+
1629
function readProjectFile(
17-
rig: { testDir?: string },
30+
rig: { testDir: string | null },
1831
relativePath: string,
1932
): string {
2033
return fs.readFileSync(path.join(rig.testDir!, relativePath), 'utf8');
@@ -117,15 +130,7 @@ describe('subagent eval test cases', () => {
117130
files: {
118131
...TEST_AGENTS.TESTING_AGENT.asFile(),
119132
'index.ts': INDEX_TS,
120-
'package.json': JSON.stringify(
121-
{
122-
name: 'subagent-eval-project',
123-
version: '1.0.0',
124-
type: 'module',
125-
},
126-
null,
127-
2,
128-
),
133+
'package.json': MOCK_PACKAGE_JSON,
129134
},
130135
assert: async (rig, _result) => {
131136
const toolLogs = rig.readToolLogs() as Array<{
@@ -164,15 +169,7 @@ describe('subagent eval test cases', () => {
164169
...TEST_AGENTS.TESTING_AGENT.asFile(),
165170
'index.ts': INDEX_TS,
166171
'README.md': 'TODO: update the README.\n',
167-
'package.json': JSON.stringify(
168-
{
169-
name: 'subagent-eval-project',
170-
version: '1.0.0',
171-
type: 'module',
172-
},
173-
null,
174-
2,
175-
),
172+
'package.json': MOCK_PACKAGE_JSON,
176173
},
177174
assert: async (rig, _result) => {
178175
const toolLogs = rig.readToolLogs() as Array<{
@@ -190,4 +187,105 @@ describe('subagent eval test cases', () => {
190187
);
191188
},
192189
});
190+
191+
/**
192+
* Checks that the main agent can correctly select the appropriate subagent
193+
* from a large pool of available subagents (10 total).
194+
*/
195+
evalTest('USUALLY_PASSES', {
196+
name: 'should select the correct subagent from a pool of 10 different agents',
197+
prompt: 'Please add a new SQL table migration for a user profile.',
198+
files: {
199+
...TEST_AGENTS.DOCS_AGENT.asFile(),
200+
...TEST_AGENTS.TESTING_AGENT.asFile(),
201+
...TEST_AGENTS.DATABASE_AGENT.asFile(),
202+
...TEST_AGENTS.CSS_AGENT.asFile(),
203+
...TEST_AGENTS.I18N_AGENT.asFile(),
204+
...TEST_AGENTS.SECURITY_AGENT.asFile(),
205+
...TEST_AGENTS.DEVOPS_AGENT.asFile(),
206+
...TEST_AGENTS.ANALYTICS_AGENT.asFile(),
207+
...TEST_AGENTS.ACCESSIBILITY_AGENT.asFile(),
208+
...TEST_AGENTS.MOBILE_AGENT.asFile(),
209+
'package.json': MOCK_PACKAGE_JSON,
210+
},
211+
assert: async (rig, _result) => {
212+
const toolLogs = rig.readToolLogs() as Array<{
213+
toolRequest: { name: string };
214+
}>;
215+
await rig.expectToolCallSuccess(['database-agent']);
216+
217+
// Ensure the generalist and other irrelevant specialists were not invoked
218+
const uncalledAgents = [
219+
'generalist',
220+
TEST_AGENTS.DOCS_AGENT.name,
221+
TEST_AGENTS.TESTING_AGENT.name,
222+
TEST_AGENTS.CSS_AGENT.name,
223+
TEST_AGENTS.I18N_AGENT.name,
224+
TEST_AGENTS.SECURITY_AGENT.name,
225+
TEST_AGENTS.DEVOPS_AGENT.name,
226+
TEST_AGENTS.ANALYTICS_AGENT.name,
227+
TEST_AGENTS.ACCESSIBILITY_AGENT.name,
228+
TEST_AGENTS.MOBILE_AGENT.name,
229+
];
230+
231+
for (const agentName of uncalledAgents) {
232+
expect(toolLogs.some((l) => l.toolRequest.name === agentName)).toBe(
233+
false,
234+
);
235+
}
236+
},
237+
});
238+
239+
/**
240+
* Checks that the main agent can correctly select the appropriate subagent
241+
* from a large pool of available subagents, even when many irrelevant MCP tools are present.
242+
*
243+
* This test stress-tests the subagent delegation with ~80 tools.
244+
*/
245+
evalTest('USUALLY_PASSES', {
246+
name: 'should select the correct subagent from a pool of 10 different agents with MCP tools present',
247+
prompt: 'Please add a new SQL table migration for a user profile.',
248+
setup: async (rig) => {
249+
rig.addTestMcpServer('workspace-server', 'google-workspace');
250+
},
251+
files: {
252+
...TEST_AGENTS.DOCS_AGENT.asFile(),
253+
...TEST_AGENTS.TESTING_AGENT.asFile(),
254+
...TEST_AGENTS.DATABASE_AGENT.asFile(),
255+
...TEST_AGENTS.CSS_AGENT.asFile(),
256+
...TEST_AGENTS.I18N_AGENT.asFile(),
257+
...TEST_AGENTS.SECURITY_AGENT.asFile(),
258+
...TEST_AGENTS.DEVOPS_AGENT.asFile(),
259+
...TEST_AGENTS.ANALYTICS_AGENT.asFile(),
260+
...TEST_AGENTS.ACCESSIBILITY_AGENT.asFile(),
261+
...TEST_AGENTS.MOBILE_AGENT.asFile(),
262+
'package.json': MOCK_PACKAGE_JSON,
263+
},
264+
assert: async (rig, _result) => {
265+
const toolLogs = rig.readToolLogs() as Array<{
266+
toolRequest: { name: string };
267+
}>;
268+
await rig.expectToolCallSuccess(['database-agent']);
269+
270+
// Ensure the generalist and other irrelevant specialists were not invoked
271+
const uncalledAgents = [
272+
'generalist',
273+
TEST_AGENTS.DOCS_AGENT.name,
274+
TEST_AGENTS.TESTING_AGENT.name,
275+
TEST_AGENTS.CSS_AGENT.name,
276+
TEST_AGENTS.I18N_AGENT.name,
277+
TEST_AGENTS.SECURITY_AGENT.name,
278+
TEST_AGENTS.DEVOPS_AGENT.name,
279+
TEST_AGENTS.ANALYTICS_AGENT.name,
280+
TEST_AGENTS.ACCESSIBILITY_AGENT.name,
281+
TEST_AGENTS.MOBILE_AGENT.name,
282+
];
283+
284+
for (const agentName of uncalledAgents) {
285+
expect(toolLogs.some((l) => l.toolRequest.name === agentName)).toBe(
286+
false,
287+
);
288+
}
289+
},
290+
});
193291
});

evals/test-helper.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ export async function internalEvalTest(evalCase: EvalCase) {
6161
try {
6262
rig.setup(evalCase.name, evalCase.params);
6363

64+
if (evalCase.setup) {
65+
await evalCase.setup(rig);
66+
}
67+
6468
if (evalCase.files) {
6569
await setupTestFiles(rig, evalCase.files);
6670
}
@@ -371,6 +375,7 @@ export interface EvalCase {
371375
prompt: string;
372376
timeout?: number;
373377
files?: Record<string, string>;
378+
setup?: (rig: TestRig) => Promise<void> | void;
374379
/** Conversation history to pre-load via --resume. Each entry is a message object with type, content, etc. */
375380
messages?: Record<string, unknown>[];
376381
/** Session ID for the resumed session. Auto-generated if not provided. */

packages/test-utils/src/fixtures/agents.ts

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,4 +69,84 @@ export const TEST_AGENTS = {
6969
tools: ['read_file', 'write_file'],
7070
body: 'You are the test agent. Add or update tests.',
7171
}),
72+
/**
73+
* An agent with expertise in database schemas, SQL, and creating database migrations.
74+
*/
75+
DATABASE_AGENT: createAgent({
76+
name: 'database-agent',
77+
description:
78+
'An expert in database schemas, SQL, and creating database migrations.',
79+
tools: ['read_file', 'write_file'],
80+
body: 'You are the database agent. Create and update SQL migrations.',
81+
}),
82+
83+
/**
84+
* An agent with expertise in CSS, styling, and UI design.
85+
*/
86+
CSS_AGENT: createAgent({
87+
name: 'css-agent',
88+
description: 'An expert in CSS, styling, and UI design.',
89+
tools: ['read_file', 'write_file'],
90+
body: 'You are the CSS agent.',
91+
}),
92+
93+
/**
94+
* An agent with expertise in internationalization and translations.
95+
*/
96+
I18N_AGENT: createAgent({
97+
name: 'i18n-agent',
98+
description: 'An expert in internationalization and translations.',
99+
tools: ['read_file', 'write_file'],
100+
body: 'You are the i18n agent.',
101+
}),
102+
103+
/**
104+
* An agent with expertise in security audits and vulnerability patches.
105+
*/
106+
SECURITY_AGENT: createAgent({
107+
name: 'security-agent',
108+
description: 'An expert in security audits and vulnerability patches.',
109+
tools: ['read_file', 'write_file'],
110+
body: 'You are the security agent.',
111+
}),
112+
113+
/**
114+
* An agent with expertise in CI/CD, Docker, and deployment scripts.
115+
*/
116+
DEVOPS_AGENT: createAgent({
117+
name: 'devops-agent',
118+
description: 'An expert in CI/CD, Docker, and deployment scripts.',
119+
tools: ['read_file', 'write_file'],
120+
body: 'You are the devops agent.',
121+
}),
122+
123+
/**
124+
* An agent with expertise in tracking, analytics, and metrics.
125+
*/
126+
ANALYTICS_AGENT: createAgent({
127+
name: 'analytics-agent',
128+
description: 'An expert in tracking, analytics, and metrics.',
129+
tools: ['read_file', 'write_file'],
130+
body: 'You are the analytics agent.',
131+
}),
132+
133+
/**
134+
* An agent with expertise in web accessibility and ARIA roles.
135+
*/
136+
ACCESSIBILITY_AGENT: createAgent({
137+
name: 'accessibility-agent',
138+
description: 'An expert in web accessibility and ARIA roles.',
139+
tools: ['read_file', 'write_file'],
140+
body: 'You are the accessibility agent.',
141+
}),
142+
143+
/**
144+
* An agent with expertise in React Native and mobile app development.
145+
*/
146+
MOBILE_AGENT: createAgent({
147+
name: 'mobile-agent',
148+
description: 'An expert in React Native and mobile app development.',
149+
tools: ['read_file', 'write_file'],
150+
body: 'You are the mobile agent.',
151+
}),
72152
} as const;

0 commit comments

Comments
 (0)