feat: add MCP test runner support for run_tests and get_test_results

abose · abose · commit 7f8b5f1c822b · 2026-02-20T18:20:06.000+05:30
Adds MCP tools to control the Phoenix test runner remotely: run test
suites by category/spec and poll structured results. Includes WS
protocol handlers, test-runner-side MCP script, and updated CLAUDE.md
with accurate suite naming guidance.
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -24,3 +24,48 @@ Use `exec_js` to run JS in the Phoenix browser runtime. jQuery `$()` is global.
 **Click AI chat buttons:** `$('.ai-edit-restore-btn:contains("Undo")').click();`
 
 **Check logs:** `get_browser_console_logs` with `filter` regex (e.g. `"AI UI"`, `"error"`) and `tail` — includes both browser console and Node.js (PhNode) logs. Use `get_terminal_logs` for Electron process output (only available if Phoenix was launched via `start_phoenix`).
+
+## Running Tests via MCP
+
+The test runner must be open as a separate Phoenix instance (it shows up as `phoenix-test-runner-*` in `get_phoenix_status`). Use `run_tests` to trigger test runs and `get_test_results` to poll for results. `take_screenshot` also works on the test runner.
+
+### Test categories
+- **unit** — Fast, no UI. Safe to run all at once (`run_tests category=unit`).
+- **integration** — Spawns a Phoenix iframe inside the test runner. Some specs require window focus and will hang if the test runner window isn't focused.
+- **LegacyInteg** — Like integration but uses the legacy test harness. Also spawns an embedded Phoenix instance.
+- **livepreview**, **mainview** — Specialized integration tests.
+- **Do NOT use:** `all`, `performance`, `extension`, `individualrun` — not actively supported.
+
+### Hierarchy: Category → Suite → Test
+- **Category** — top-level grouping: `unit`, `integration`, `LegacyInteg`, etc. Safe to run an entire category.
+- **Suite** — a group of related tests within a category (e.g. `integration: FileFilters` has ~20 tests). This is the `spec` parameter value.
+- **Test** — a single test within a suite.
+
+### Running all tests in a category
+```
+run_tests(category="unit")
+```
+
+### Running a single suite
+Pass the exact suite name as the `spec` parameter. **Suite names do NOT always have a category prefix.** Many suites are registered with just their plain name (e.g. `"CSS Parsing"`, `"Editor"`, `"JSUtils"`), while others include a prefix (e.g. `"unit:Phoenix Platform Tests"`, `"integration: FileFilters"`, `"LegacyInteg:ExtensionLoader"`). If the suite name is wrong, the test runner will show a blank page with 0 specs and appear stuck.
+
+**To discover the exact suite name**, run this in `exec_js` on the test runner instance:
+```js
+return jasmine.getEnv().topSuite().children.map(s => s.description);
+```
+
+Examples:
+```
+run_tests(category="unit", spec="CSS Parsing")
+run_tests(category="unit", spec="unit:Phoenix Platform Tests")
+run_tests(category="integration", spec="integration: FileFilters")
+run_tests(category="LegacyInteg", spec="LegacyInteg:ExtensionLoader")
+```
+
+### Running individual tests
+You can pass a specific test's full name as `spec` to run just that one test. It is perfectly valid to run a single test. However, if a test fails when run on its own, re-run its full suite before treating the failure as real — suites sometimes execute tests in order with shared state, so a test may fail in isolation yet pass within its suite. If the test passes as part of the full suite, the isolated failure was an ordering artifact and the test itself is fine.
+
+### Gotchas
+- **Instance name changes on reload:** The test runner gets a new random instance name each time the page reloads. Always check `get_phoenix_status` after a `run_tests` call to get the current instance name.
+- **Integration tests may hang:** Specs labeled "needs window focus" will hang indefinitely if the test runner doesn't have OS-level window focus. If `get_test_results` starts timing out, the event loop is likely blocked by a stuck spec — use `force_reload_phoenix` to recover.
+- **LegacyInteg/integration tests spawn an iframe:** These tests open an embedded Phoenix instance inside the test runner, so they are slower and more resource-intensive than unit tests.
diff --git a/phoenix-builder-mcp/mcp-tools.js b/phoenix-builder-mcp/mcp-tools.js
@@ -383,6 +383,81 @@ export function registerTools(server, processManager, wsControlServer, phoenixDe
         }
     );
 
+    server.tool(
+        "run_tests",
+        "Run tests in the Phoenix test runner (SpecRunner.html). Reloads the test runner with the specified " +
+        "category and optional spec filter. The test runner must already be open in a browser with MCP enabled. " +
+        "Supported categories: unit, integration, LegacyInteg, livepreview, mainview. " +
+        "WARNING: Do NOT use 'all', 'performance', 'extension', or 'individualrun' categories — they are " +
+        "not actively supported and the full 'all' suite should never be run. " +
+        "To run all tests in a category, omit the spec parameter. " +
+        "To run a single suite, pass the suite name as spec (e.g. spec='unit: HTML Code Hinting'). " +
+        "Suite names are prefixed with the category and a colon, e.g. 'unit: Editor', 'unit: CSS Parsing'. " +
+        "You can also run individual specs by passing the full spec name, but note that individual specs " +
+        "may fail when run alone because suites often run tests in order with shared state — prefer " +
+        "running the full suite instead of individual specs. " +
+        "After calling run_tests, use get_test_results to poll for results.",
+        {
+            category: z.string().describe("Test category to run: unit, integration, LegacyInteg, livepreview, or mainview."),
+            spec: z.string().optional().describe("Optional suite or spec name to run within the category. " +
+                "Use the full name including category prefix, e.g. 'unit: CSS Parsing' for a suite. " +
+                "Prefer running full suites over individual specs, as specs may depend on suite execution order. " +
+                "Omit to run all tests in the category."),
+            instance: z.string().optional().describe("Target a specific test runner instance by name. Required when multiple instances are connected.")
+        },
+        async ({ category, spec, instance }) => {
+            try {
+                const result = await wsControlServer.requestRunTests(category, spec, instance);
+                return {
+                    content: [{
+                        type: "text",
+                        text: JSON.stringify({
+                            success: true,
+                            message: result.message || "Test runner is reloading with category=" + category
+                        })
+                    }]
+                };
+            } catch (err) {
+                return {
+                    content: [{
+                        type: "text",
+                        text: JSON.stringify({ error: err.message })
+                    }]
+                };
+            }
+        }
+    );
+
+    server.tool( // MCP tool: fetches the test runner's current results and returns them as pretty-printed JSON text
+        "get_test_results",
+        "Get structured test results from the Phoenix test runner. Returns running status, pass/fail counts, " +
+        "failure details, and the currently executing spec. The test runner must already be open with MCP enabled.",
+        {
+            instance: z.string().optional().describe("Target a specific test runner instance by name. Required when multiple instances are connected.")
+        },
+        async ({ instance }) => {
+            try {
+                const result = await wsControlServer.requestTestResults(instance); // resolves with the raw response message
+                // Strip the WebSocket envelope fields (message type and request id) so only the result payload is reported
+                delete result.type;
+                delete result.id;
+                return {
+                    content: [{
+                        type: "text",
+                        text: JSON.stringify(result, null, 2)
+                    }]
+                };
+            } catch (err) { // failures are returned as an { error } JSON payload instead of being thrown to the MCP caller
+                return {
+                    content: [{
+                        type: "text",
+                        text: JSON.stringify({ error: err.message })
+                    }]
+                };
+            }
+        }
+    );
+
     server.tool(
         "get_phoenix_status",
         "Check the status of the Phoenix process and WebSocket connection.",
diff --git a/phoenix-builder-mcp/ws-control-server.js b/phoenix-builder-mcp/ws-control-server.js
@@ -109,6 +109,28 @@ export function createWSControlServer(port) {
                     break;
                 }
 
+                case "run_tests_response": { // reply to a run_tests_request; settles the pending promise with the same id
+                    const pendingRt = pendingRequests.get(msg.id);
+                    if (pendingRt) { // no entry means the id is not (or no longer) pending; the reply is ignored
+                        pendingRequests.delete(msg.id);
+                        if (msg.success) {
+                            pendingRt.resolve({ success: true, message: msg.message });
+                        } else {
+                            pendingRt.reject(new Error(msg.message || "run_tests failed")); // generic text when the reply has no message
+                        }
+                    }
+                    break;
+                }
+
+                case "get_test_results_response": { // reply to a get_test_results_request; always resolves, never rejects
+                    const pendingTr = pendingRequests.get(msg.id);
+                    if (pendingTr) { // no entry means the id is not (or no longer) pending; the reply is ignored
+                        pendingRequests.delete(msg.id);
+                        pendingTr.resolve(msg); // the whole message is passed through, including its type and id fields
+                    }
+                    break;
+                }
+
                 case "reload_response": {
                     const pending3 = pendingRequests.get(msg.id);
                     if (pending3) {
@@ -390,6 +412,80 @@ export function createWSControlServer(port) {
         });
     }
 
+    function requestRunTests(category, spec, instanceName) { // sends a run_tests_request to the resolved client; returns a promise settled by the pendingRequests entry or the 30s timeout
+        return new Promise((resolve, reject) => {
+            const resolved = _resolveClient(instanceName);
+            if (resolved.error) { // client lookup failed; surface its error text as the rejection
+                reject(new Error(resolved.error));
+                return;
+            }
+
+            const { client } = resolved;
+            if (client.ws.readyState !== 1) { // 1 is the WebSocket OPEN state; anything else cannot accept a send
+                reject(new Error("Phoenix client \"" + resolved.name + "\" is not connected"));
+                return;
+            }
+
+            const id = ++requestIdCounter; // id that keys this request in pendingRequests
+            const timeout = setTimeout(() => { // give up after 30s and drop the pending entry so it cannot be settled later
+                pendingRequests.delete(id);
+                reject(new Error("run_tests request timed out (30s)"));
+            }, 30000);
+
+            pendingRequests.set(id, { // both settle paths cancel the timeout before settling the promise
+                resolve: (data) => {
+                    clearTimeout(timeout);
+                    resolve(data);
+                },
+                reject: (err) => {
+                    clearTimeout(timeout);
+                    reject(err);
+                }
+            });
+
+            const msg = { type: "run_tests_request", id, category };
+            if (spec) { // spec is left out of the message when falsy (undefined or empty string)
+                msg.spec = spec;
+            }
+            client.ws.send(JSON.stringify(msg));
+        });
+    }
+
+    function requestTestResults(instanceName) { // sends a get_test_results_request to the resolved client; returns a promise settled by the pendingRequests entry or the 30s timeout
+        return new Promise((resolve, reject) => {
+            const resolved = _resolveClient(instanceName);
+            if (resolved.error) { // client lookup failed; surface its error text as the rejection
+                reject(new Error(resolved.error));
+                return;
+            }
+
+            const { client } = resolved;
+            if (client.ws.readyState !== 1) { // 1 is the WebSocket OPEN state; anything else cannot accept a send
+                reject(new Error("Phoenix client \"" + resolved.name + "\" is not connected"));
+                return;
+            }
+
+            const id = ++requestIdCounter; // id that keys this request in pendingRequests
+            const timeout = setTimeout(() => { // give up after 30s and drop the pending entry so it cannot be settled later
+                pendingRequests.delete(id);
+                reject(new Error("get_test_results request timed out (30s)"));
+            }, 30000);
+
+            pendingRequests.set(id, { // both settle paths cancel the timeout before settling the promise
+                resolve: (data) => {
+                    clearTimeout(timeout);
+                    resolve(data);
+                },
+                reject: (err) => {
+                    clearTimeout(timeout);
+                    reject(err);
+                }
+            });
+
+            client.ws.send(JSON.stringify({ type: "get_test_results_request", id })); // the request carries no payload beyond its type and id
+        });
+    }
+
     function getBrowserLogs(sinceLast, instanceName) {
         const resolved = _resolveClient(instanceName);
         if (resolved.error) {
@@ -442,6 +538,8 @@ export function createWSControlServer(port) {
         requestLogs,
         requestExecJs,
         requestExecJsLivePreview,
+        requestRunTests,
+        requestTestResults,
         getBrowserLogs,
         clearBrowserLogs,
         isClientConnected,
diff --git a/src/phoenix-builder/phoenix-builder-boot.js b/src/phoenix-builder/phoenix-builder-boot.js
@@ -90,7 +90,8 @@
         let name = sessionStorage.getItem(INSTANCE_NAME_KEY);
         if (!name) {
             const hex = Math.floor(Math.random() * 0x10000).toString(16).padStart(4, "0");
-            name = "phoenix-" + _getPlatformTag() + "-" + hex;
+            const prefix = window._phoenixBuilderNamePrefix || "phoenix";
+            name = prefix + "-" + _getPlatformTag() + "-" + hex;
             sessionStorage.setItem(INSTANCE_NAME_KEY, name);
         }
         return name;
diff --git a/test/SpecRunner.html b/test/SpecRunner.html
@@ -394,6 +394,10 @@
     }());
   </script>
 
+  <script>window._phoenixBuilderNamePrefix = "phoenix-test-runner";</script>
+  <script src="../src/phoenix-builder/phoenix-builder-boot.js"></script>
+  <script src="phoenix-test-runner-mcp.js"></script>
+
   <script src="../src/phoenix/shell.js" type="module"></script>
   <script src="virtual-server-loader.js" type="module"></script>
   <script src="../src/node-loader.js" defer></script>
diff --git a/test/SpecRunner.js b/test/SpecRunner.js
@@ -484,6 +484,7 @@ define(function (require, exports, module) {
             // Create the reporter, which is really a model class that just gathers
             // spec and performance data.
             reporter = new UnitTestReporter(jasmineEnv, params.get("spec"), selectedCategories);
+            window._unitTestReporter = reporter;
             SpecRunnerUtils.setUnitTestReporter(reporter);
 
             // Optionally emit JUnit XML file for automated runs
diff --git a/test/phoenix-test-runner-mcp.js b/test/phoenix-test-runner-mcp.js