test(evals): add comprehensive subagent delegation evaluations (#24132)

abhipatel12 · web-flow · commit d9d2ce36f2a7 · 2026-03-29T23:13:50.000Z
diff --git a/evals/subagents.eval.ts b/evals/subagents.eval.ts
@@ -13,8 +13,21 @@ import { evalTest, TEST_AGENTS } from './test-helper.js';
 
 const INDEX_TS = 'export const add = (a: number, b: number) => a + b;\n';
 
+// A minimal package.json is used to provide a realistic workspace anchor.
+// This prevents the agent from making incorrect assumptions about the environment
+// and helps it properly navigate or act as if it is in a standard Node.js project.
+const MOCK_PACKAGE_JSON = JSON.stringify(
+  {
+    name: 'subagent-eval-project',
+    version: '1.0.0',
+    type: 'module',
+  },
+  null,
+  2,
+);
+
 function readProjectFile(
-  rig: { testDir?: string },
+  rig: { testDir: string | null },
   relativePath: string,
 ): string {
   return fs.readFileSync(path.join(rig.testDir!, relativePath), 'utf8');
@@ -117,15 +130,7 @@ describe('subagent eval test cases', () => {
     files: {
       ...TEST_AGENTS.TESTING_AGENT.asFile(),
       'index.ts': INDEX_TS,
-      'package.json': JSON.stringify(
-        {
-          name: 'subagent-eval-project',
-          version: '1.0.0',
-          type: 'module',
-        },
-        null,
-        2,
-      ),
+      'package.json': MOCK_PACKAGE_JSON,
     },
     assert: async (rig, _result) => {
       const toolLogs = rig.readToolLogs() as Array<{
@@ -164,15 +169,7 @@ describe('subagent eval test cases', () => {
       ...TEST_AGENTS.TESTING_AGENT.asFile(),
       'index.ts': INDEX_TS,
       'README.md': 'TODO: update the README.\n',
-      'package.json': JSON.stringify(
-        {
-          name: 'subagent-eval-project',
-          version: '1.0.0',
-          type: 'module',
-        },
-        null,
-        2,
-      ),
+      'package.json': MOCK_PACKAGE_JSON,
     },
     assert: async (rig, _result) => {
       const toolLogs = rig.readToolLogs() as Array<{
@@ -190,4 +187,105 @@ describe('subagent eval test cases', () => {
       );
     },
   });
+
+  /**
+   * Checks that the main agent can correctly select the appropriate subagent
+   * from a large pool of available subagents (10 total).
+   */
+  evalTest('USUALLY_PASSES', {
+    name: 'should select the correct subagent from a pool of 10 different agents',
+    prompt: 'Please add a new SQL table migration for a user profile.',
+    files: {
+      ...TEST_AGENTS.DOCS_AGENT.asFile(),
+      ...TEST_AGENTS.TESTING_AGENT.asFile(),
+      ...TEST_AGENTS.DATABASE_AGENT.asFile(),
+      ...TEST_AGENTS.CSS_AGENT.asFile(),
+      ...TEST_AGENTS.I18N_AGENT.asFile(),
+      ...TEST_AGENTS.SECURITY_AGENT.asFile(),
+      ...TEST_AGENTS.DEVOPS_AGENT.asFile(),
+      ...TEST_AGENTS.ANALYTICS_AGENT.asFile(),
+      ...TEST_AGENTS.ACCESSIBILITY_AGENT.asFile(),
+      ...TEST_AGENTS.MOBILE_AGENT.asFile(),
+      'package.json': MOCK_PACKAGE_JSON,
+    },
+    assert: async (rig, _result) => {
+      const toolLogs = rig.readToolLogs() as Array<{
+        toolRequest: { name: string };
+      }>;
+      await rig.expectToolCallSuccess(['database-agent']);
+
+      // Ensure the generalist and other irrelevant specialists were not invoked
+      const uncalledAgents = [
+        'generalist',
+        TEST_AGENTS.DOCS_AGENT.name,
+        TEST_AGENTS.TESTING_AGENT.name,
+        TEST_AGENTS.CSS_AGENT.name,
+        TEST_AGENTS.I18N_AGENT.name,
+        TEST_AGENTS.SECURITY_AGENT.name,
+        TEST_AGENTS.DEVOPS_AGENT.name,
+        TEST_AGENTS.ANALYTICS_AGENT.name,
+        TEST_AGENTS.ACCESSIBILITY_AGENT.name,
+        TEST_AGENTS.MOBILE_AGENT.name,
+      ];
+
+      for (const agentName of uncalledAgents) {
+        expect(toolLogs.some((l) => l.toolRequest.name === agentName)).toBe(
+          false,
+        );
+      }
+    },
+  });
+
+  /**
+   * Checks that the main agent can correctly select the appropriate subagent
+   * from a large pool of available subagents, even when many irrelevant MCP tools are present.
+   *
+   * This test stress-tests subagent delegation with ~80 tools present.
+   */
+  evalTest('USUALLY_PASSES', {
+    name: 'should select the correct subagent from a pool of 10 different agents with MCP tools present',
+    prompt: 'Please add a new SQL table migration for a user profile.',
+    setup: async (rig) => {
+      rig.addTestMcpServer('workspace-server', 'google-workspace');
+    },
+    files: {
+      ...TEST_AGENTS.DOCS_AGENT.asFile(),
+      ...TEST_AGENTS.TESTING_AGENT.asFile(),
+      ...TEST_AGENTS.DATABASE_AGENT.asFile(),
+      ...TEST_AGENTS.CSS_AGENT.asFile(),
+      ...TEST_AGENTS.I18N_AGENT.asFile(),
+      ...TEST_AGENTS.SECURITY_AGENT.asFile(),
+      ...TEST_AGENTS.DEVOPS_AGENT.asFile(),
+      ...TEST_AGENTS.ANALYTICS_AGENT.asFile(),
+      ...TEST_AGENTS.ACCESSIBILITY_AGENT.asFile(),
+      ...TEST_AGENTS.MOBILE_AGENT.asFile(),
+      'package.json': MOCK_PACKAGE_JSON,
+    },
+    assert: async (rig, _result) => {
+      const toolLogs = rig.readToolLogs() as Array<{
+        toolRequest: { name: string };
+      }>;
+      await rig.expectToolCallSuccess(['database-agent']);
+
+      // Ensure the generalist and other irrelevant specialists were not invoked
+      const uncalledAgents = [
+        'generalist',
+        TEST_AGENTS.DOCS_AGENT.name,
+        TEST_AGENTS.TESTING_AGENT.name,
+        TEST_AGENTS.CSS_AGENT.name,
+        TEST_AGENTS.I18N_AGENT.name,
+        TEST_AGENTS.SECURITY_AGENT.name,
+        TEST_AGENTS.DEVOPS_AGENT.name,
+        TEST_AGENTS.ANALYTICS_AGENT.name,
+        TEST_AGENTS.ACCESSIBILITY_AGENT.name,
+        TEST_AGENTS.MOBILE_AGENT.name,
+      ];
+
+      for (const agentName of uncalledAgents) {
+        expect(toolLogs.some((l) => l.toolRequest.name === agentName)).toBe(
+          false,
+        );
+      }
+    },
+  });
 });
diff --git a/evals/test-helper.ts b/evals/test-helper.ts
@@ -61,6 +61,10 @@ export async function internalEvalTest(evalCase: EvalCase) {
     try {
       rig.setup(evalCase.name, evalCase.params);
 
+      if (evalCase.setup) {
+        await evalCase.setup(rig);
+      }
+
       if (evalCase.files) {
         await setupTestFiles(rig, evalCase.files);
       }
@@ -371,6 +375,7 @@ export interface EvalCase {
   prompt: string;
   timeout?: number;
   files?: Record<string, string>;
+  setup?: (rig: TestRig) => Promise<void> | void;
   /** Conversation history to pre-load via --resume. Each entry is a message object with type, content, etc. */
   messages?: Record<string, unknown>[];
   /** Session ID for the resumed session. Auto-generated if not provided. */
diff --git a/packages/test-utils/src/fixtures/agents.ts b/packages/test-utils/src/fixtures/agents.ts
@@ -69,4 +69,84 @@ export const TEST_AGENTS = {
     tools: ['read_file', 'write_file'],
     body: 'You are the test agent. Add or update tests.',
   }),
+  /**
+   * An agent with expertise in database schemas, SQL, and creating database migrations.
+   */
+  DATABASE_AGENT: createAgent({
+    name: 'database-agent',
+    description:
+      'An expert in database schemas, SQL, and creating database migrations.',
+    tools: ['read_file', 'write_file'],
+    body: 'You are the database agent. Create and update SQL migrations.',
+  }),
+
+  /**
+   * An agent with expertise in CSS, styling, and UI design.
+   */
+  CSS_AGENT: createAgent({
+    name: 'css-agent',
+    description: 'An expert in CSS, styling, and UI design.',
+    tools: ['read_file', 'write_file'],
+    body: 'You are the CSS agent.',
+  }),
+
+  /**
+   * An agent with expertise in internationalization and translations.
+   */
+  I18N_AGENT: createAgent({
+    name: 'i18n-agent',
+    description: 'An expert in internationalization and translations.',
+    tools: ['read_file', 'write_file'],
+    body: 'You are the i18n agent.',
+  }),
+
+  /**
+   * An agent with expertise in security audits and vulnerability patches.
+   */
+  SECURITY_AGENT: createAgent({
+    name: 'security-agent',
+    description: 'An expert in security audits and vulnerability patches.',
+    tools: ['read_file', 'write_file'],
+    body: 'You are the security agent.',
+  }),
+
+  /**
+   * An agent with expertise in CI/CD, Docker, and deployment scripts.
+   */
+  DEVOPS_AGENT: createAgent({
+    name: 'devops-agent',
+    description: 'An expert in CI/CD, Docker, and deployment scripts.',
+    tools: ['read_file', 'write_file'],
+    body: 'You are the devops agent.',
+  }),
+
+  /**
+   * An agent with expertise in tracking, analytics, and metrics.
+   */
+  ANALYTICS_AGENT: createAgent({
+    name: 'analytics-agent',
+    description: 'An expert in tracking, analytics, and metrics.',
+    tools: ['read_file', 'write_file'],
+    body: 'You are the analytics agent.',
+  }),
+
+  /**
+   * An agent with expertise in web accessibility and ARIA roles.
+   */
+  ACCESSIBILITY_AGENT: createAgent({
+    name: 'accessibility-agent',
+    description: 'An expert in web accessibility and ARIA roles.',
+    tools: ['read_file', 'write_file'],
+    body: 'You are the accessibility agent.',
+  }),
+
+  /**
+   * An agent with expertise in React Native and mobile app development.
+   */
+  MOBILE_AGENT: createAgent({
+    name: 'mobile-agent',
+    description: 'An expert in React Native and mobile app development.',
+    tools: ['read_file', 'write_file'],
+    body: 'You are the mobile agent.',
+  }),
 } as const;