Skip to content

Commit 43ca763

Browse files
committed
add test_servers.py to verify that paid api-key-needing services are working and valid
1 parent 647b8bd commit 43ca763

File tree

2 files changed

+387
-0
lines changed

2 files changed

+387
-0
lines changed

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,11 @@ Note: Some services are paid and require billing setup.
202202
curl -s http://localhost:1984/enabled-servers | jq -c
203203
```
204204

205+
To do a deeper check — verifying that each server can make a real API call and that all required keys are valid — run the health-check script. It tests every server defined in the template (36 at the time of writing) and reports pass/fail; for each failing server it also flags any required keys that are missing from `.env`.
206+
```bash
207+
uv run test_servers.py
208+
```
209+
205210
If the docker container does not shut down gracefully, use `docker ps` and `docker kill <ID>` to force it to shut down.
206211

207212
### 8. Evaluate with the full HuggingFace dataset

services/mcp_eval/test_servers.py

Lines changed: 382 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,382 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Health-check script for all MCP servers defined in mcp_server_template.json.
4+
Makes one representative call per server and reports pass/fail.
5+
6+
Server list and API-key requirements are derived automatically from the
7+
template file — no need to update this script when servers are added/removed.
8+
9+
Usage:
10+
uv run test_servers.py
11+
uv run test_servers.py --timeout 30
12+
uv run test_servers.py --concurrency 10
13+
uv run test_servers.py --server github # test a single server
14+
"""
15+
16+
from __future__ import annotations
17+
18+
import argparse
19+
import asyncio
20+
import json
21+
import re
22+
import time
23+
from dataclasses import dataclass
24+
from pathlib import Path
25+
from typing import Any
26+
27+
import httpx
28+
29+
# ── Paths ────────────────────────────────────────────────────────────────────
30+
SCRIPT_DIR = Path(__file__).parent
31+
REPO_ROOT = SCRIPT_DIR.parents[1]
32+
TEMPLATE_PATH = (
33+
REPO_ROOT / "services/agent-environment/src/agent_environment/mcp_server_template.json"
34+
)
35+
ENV_PATH = REPO_ROOT / ".env"
36+
BASE_URL = "http://localhost:1984/call-tool"
37+
38+
39+
# ── Parse .env ───────────────────────────────────────────────────────────────
40+
def load_env_keys(env_path: Path) -> set[str]:
    """Return the names of variables that have a non-empty value in a .env file.

    Blank lines, ``#`` comment lines and lines without ``=`` are ignored. A
    leading ``export `` on the name is dropped, and one pair of matching
    surrounding quotes is removed from the value before emptiness is tested,
    so ``KEY=""`` counts as unset and ``export KEY=abc`` is reported as ``KEY``.

    Args:
        env_path: Location of the .env file. A missing file yields an empty set.

    Returns:
        The set of variable names whose value is not blank.
    """
    if not env_path.exists():
        return set()

    keys: set[str] = set()
    # Explicit encoding so the result does not depend on the platform locale.
    for raw_line in env_path.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue

        name, _, value = line.partition("=")

        # Shell-style "export NAME=value" lines: keep only NAME.
        name = name.strip()
        if name.startswith("export "):
            name = name[len("export "):].strip()

        # A quoted empty string ("" or '') is still an empty value.
        value = value.strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]

        if name and value.strip():
            keys.add(name)
    return keys
53+
54+
55+
# ── Load server list from template ───────────────────────────────────────────
56+
def _extract_vars(server_cfg: dict) -> list[str]:
    """Return the distinct ${VAR_NAME} references found in a server config.

    The config is serialised to JSON and scanned as text, so references are
    found at any nesting depth. Names may contain digits after the first
    character (e.g. ``E2B_API_KEY``). Each name is reported once, in order of
    first appearance.
    """
    found = re.findall(r"\$\{([A-Z_][A-Z0-9_]*)\}", json.dumps(server_cfg))
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(found))


def _uses_api_key(server_cfg: dict) -> bool:
    """Return True when the server config references at least one ${VAR}."""
    return bool(_extract_vars(server_cfg))


def load_servers(
    template_path: Path | None = None,
) -> tuple[dict[str, bool], dict[str, list[str]]]:
    """Return ({server: needs_key}, {server: [VAR_NAMES]}) from the template.

    Args:
        template_path: JSON template to read. Defaults to the module-level
            ``TEMPLATE_PATH`` when omitted.

    Returns:
        A pair of dicts keyed by server name (the keys of the template's
        ``mcpServers`` object): whether the server references any ``${VAR}``,
        and the list of variable names it references.

    Raises:
        OSError: if the template cannot be opened.
        json.JSONDecodeError: if the template is not valid JSON.
    """
    path = TEMPLATE_PATH if template_path is None else template_path
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    servers = data.get("mcpServers", {})

    # Scan each config once and derive the boolean view from the same result.
    required_vars = {name: _extract_vars(cfg) for name, cfg in servers.items()}
    needs_key = {name: bool(names) for name, names in required_vars.items()}
    return needs_key, required_vars
73+
74+
75+
# ── Hardcoded test calls ──────────────────────────────────────────────────────
76+
# One simple, read-only call per server that exercises real functionality.
77+
# Key: server name exactly as it appears in mcp_server_template.json
78+
TEST_CALLS: dict[str, tuple[str, dict]] = {
79+
# No API key
80+
"arxiv": (
81+
"arxiv_search_papers",
82+
{"query": "machine learning", "max_results": 1},
83+
),
84+
"calculator": (
85+
"calculator_calculate",
86+
{"expression": "2 + 2"},
87+
),
88+
"cli-mcp-server": (
89+
"cli-mcp-server_run_command",
90+
{"command": "ls /data"},
91+
),
92+
"clinicaltrialsgov-mcp-server": (
93+
"clinicaltrialsgov-mcp-server_clinicaltrials_list_studies",
94+
{"query": {"term": "diabetes"}, "pageSize": 1},
95+
),
96+
"context7": (
97+
"context7_resolve-library-id",
98+
{"libraryName": "react"},
99+
),
100+
"ddg-search": (
101+
"ddg-search_search",
102+
{"query": "python programming"},
103+
),
104+
"desktop-commander": (
105+
"desktop-commander_list_directory",
106+
{"path": "/data"},
107+
),
108+
"fetch": (
109+
"fetch_fetch",
110+
{"url": "https://httpbin.org/get"},
111+
),
112+
"filesystem": (
113+
"filesystem_list_allowed_directories",
114+
{},
115+
),
116+
"git": (
117+
"git_git_status",
118+
{"repo_path": "/data/repos/mcp-server-calculator"},
119+
),
120+
"memory": (
121+
"memory_search_nodes",
122+
{"query": "test"},
123+
),
124+
"met-museum": (
125+
"met-museum_get-museum-object",
126+
{"objectId": 32907},
127+
),
128+
"mcp-code-executor": (
129+
"mcp-code-executor_execute_code",
130+
{"code": "print(1 + 1)"},
131+
),
132+
"mcp-server-code-runner": (
133+
"mcp-server-code-runner_run-code",
134+
{"languageId": "python", "code": "print(1 + 1)"},
135+
),
136+
"open-library": (
137+
"open-library_get_book_by_title",
138+
{"title": "Dune"},
139+
),
140+
"osm-mcp-server": (
141+
"osm-mcp-server_geocode_address",
142+
{"address": "New York City"},
143+
),
144+
"pubmed": (
145+
"pubmed_search_pubmed_key_words",
146+
{"key_words": "diabetes"},
147+
),
148+
"weather": (
149+
"weather_find_weather_stations",
150+
{"location": "48.0993244, -123.4256985"},
151+
),
152+
"whois": (
153+
"whois_whois_domain",
154+
{"domain": "example.com"},
155+
),
156+
"wikipedia": (
157+
"wikipedia_search_wikipedia",
158+
{"query": "python", "limit": 1},
159+
),
160+
161+
# Needs API key
162+
"airtable": (
163+
"airtable_list_bases",
164+
{},
165+
),
166+
"alchemy": (
167+
"alchemy_fetchTokenPriceBySymbol",
168+
{"symbols": ["ETH"]},
169+
),
170+
"brave-search": (
171+
"brave-search_brave_web_search",
172+
{"query": "latest AI news"},
173+
),
174+
"e2b-server": (
175+
"e2b-server_run_code",
176+
{"code": "print(1 + 1)"},
177+
),
178+
"exa": (
179+
"exa_web_search_exa",
180+
{"query": "python programming"},
181+
),
182+
"github": (
183+
"github_list_commits",
184+
{"owner": "torvalds", "repo": "subsurface"},
185+
),
186+
"google-maps": (
187+
"google-maps_maps_geocode",
188+
{"address": "New York City"},
189+
),
190+
"google-workspace": (
191+
"google-workspace_list_events",
192+
{"maxResults": 1},
193+
),
194+
"lara-translate": (
195+
"lara-translate_translate",
196+
{"text": [{"text": "Hello world", "translatable": True}], "target": "fr", "source": "en"},
197+
),
198+
"mongodb": (
199+
"mongodb_list-databases",
200+
{},
201+
),
202+
"national-parks": (
203+
"national-parks_findParks",
204+
{"q": "Yellowstone", "stateCode": "WY"},
205+
),
206+
"notion": (
207+
"notion_API-get-users",
208+
{},
209+
),
210+
"oxylabs": (
211+
"oxylabs_google_search_scraper",
212+
{"query": "python"},
213+
),
214+
"slack": (
215+
"slack_channels_list",
216+
{"channel_types": "public_channel"},
217+
),
218+
"twelvedata": (
219+
"twelvedata_GetPrice",
220+
{"params": {"symbol": "AAPL"}},
221+
),
222+
"weather-data": (
223+
"weather-data_weather_current",
224+
{"q": "London"},
225+
),
226+
}
227+
228+
229+
# ── Result dataclass ──────────────────────────────────────────────────────────
230+
@dataclass
class Result:
    """Outcome of one health-check call made against a single server."""

    # Which call was made.
    server: str
    needs_key: bool
    tool: str

    # How it went: pass/fail flag and elapsed time in seconds.
    ok: bool
    elapsed: float

    # Response details; these keep their defaults when no response was received.
    status_code: int = 0
    preview: str = ""
    error: str = ""

    # Env vars that were absent in .env; None is replaced by a fresh list below.
    missing_keys: list[str] = None

    def __post_init__(self):
        # Every instance gets its own list rather than sharing one default.
        self.missing_keys = [] if self.missing_keys is None else self.missing_keys
245+
246+
247+
# ── Per-request logic ─────────────────────────────────────────────────────────
248+
def _payload_reports_error(data: Any) -> bool:
    """Return True when a decoded response body signals a tool-level failure.

    Recognised signals:
      * a dict with ``isError`` set to True or a truthy ``error`` entry;
      * a list (or a dict's ``content`` list) containing an item whose
        ``text`` starts with ``"Error:"``.

    NOTE(review): ``isError`` / ``content`` follow the MCP tool-result shape;
    confirm that the /call-tool endpoint forwards that shape unchanged.
    """
    if isinstance(data, dict):
        if data.get("isError") is True or data.get("error"):
            return True
        return _payload_reports_error(data.get("content"))
    if isinstance(data, list):
        return any(
            isinstance(item, dict)
            and isinstance(item.get("text", ""), str)
            and item.get("text", "").startswith("Error:")
            for item in data
        )
    return False


async def run_test(
    client: httpx.AsyncClient,
    server: str,
    needs_key: bool,
    tool: str,
    arguments: dict[str, Any],
    timeout: float,
) -> Result:
    """POST one tool call to BASE_URL and turn the outcome into a Result.

    Args:
        client: Shared async HTTP client used to send the request.
        server: Server name, copied into the Result.
        needs_key: Whether the server requires an API key, copied into the Result.
        tool: Tool name sent as ``tool_name``.
        arguments: Tool arguments sent as ``tool_args``.
        timeout: Per-request timeout in seconds.

    Returns:
        A Result whose ``ok`` is True only when the HTTP status is below 300
        and the body carries no tool-level error signal. Timeouts and other
        request failures are returned as failed Results, never raised.
    """
    payload = {"tool_name": tool, "tool_args": arguments}
    t0 = time.monotonic()
    try:
        resp = await client.post(BASE_URL, json=payload, timeout=timeout)
        elapsed = time.monotonic() - t0
        body = resp.text
        ok = resp.status_code < 300

        # An HTTP success can still wrap a failed tool call, so inspect the body.
        if ok:
            try:
                data = resp.json()
            except ValueError:
                # Body is not JSON; there is nothing structured to inspect.
                data = None
            if _payload_reports_error(data):
                ok = False

        preview = body.replace("\n", " ").strip()[:120]
        return Result(server, needs_key, tool, ok, elapsed,
                      status_code=resp.status_code, preview=preview)
    except httpx.TimeoutException:
        elapsed = time.monotonic() - t0
        return Result(server, needs_key, tool, False, elapsed,
                      error=f"Timed out after {timeout}s")
    except Exception as exc:
        # Deliberately broad: one broken server must not abort the whole run.
        elapsed = time.monotonic() - t0
        # Some exceptions stringify to "", so fall back to the class name.
        return Result(server, needs_key, tool, False, elapsed,
                      error=str(exc) or type(exc).__name__)
290+
291+
292+
# ── Main ──────────────────────────────────────────────────────────────────────
293+
async def main(timeout: float, concurrency: int, only_server: str | None) -> None:
    """Run the health check and print a grouped pass/fail report.

    Servers come from load_servers(); each one that has an entry in TEST_CALLS
    is called once through run_test(), with at most ``concurrency`` requests
    in flight. Failed results are annotated with any required variables that
    are not set in the .env file.

    Args:
        timeout: Per-request timeout in seconds.
        concurrency: Maximum number of parallel requests; must be at least 1.
        only_server: When given, test only this server.

    Raises:
        SystemExit: with code 2 for unusable arguments (non-positive
            concurrency, or a ``only_server`` that is unknown or has no test
            call); with code 1 when no test ran or any test failed.
    """
    # Semaphore(0) would never admit a task; a negative value raises ValueError.
    if concurrency < 1:
        print(f"--concurrency must be at least 1 (got {concurrency})")
        raise SystemExit(2)

    servers, required_vars = load_servers()
    env_keys = load_env_keys(ENV_PATH)
    total = len(servers)

    if only_server is not None:
        # Reject a bad filter up front, otherwise zero tests run and the
        # summary would claim success.
        if only_server not in servers:
            print(f"Unknown server '{only_server}'. Known servers: {', '.join(sorted(servers))}")
            raise SystemExit(2)
        if only_server not in TEST_CALLS:
            print(f"No test call defined for '{only_server}'. Add an entry to TEST_CALLS in this script.")
            raise SystemExit(2)
    else:
        # Warn about any servers in the template that lack a test call
        no_test = [s for s in servers if s not in TEST_CALLS]
        if no_test:
            print(f"\n⚠️ No test call defined for: {', '.join(no_test)}")
            print(" Add entries to TEST_CALLS in this script to cover them.\n")

    # Build the list of tests to run
    tests = [
        (name, servers[name], *TEST_CALLS[name])
        for name in servers
        if name in TEST_CALLS and (only_server is None or name == only_server)
    ]
    if not tests:
        # Nothing to run must not be reported as "all clear".
        print("No tests were run: the template and TEST_CALLS have no servers in common.")
        raise SystemExit(1)

    sem = asyncio.Semaphore(concurrency)

    async def bounded(client: httpx.AsyncClient, *args: Any) -> Result:
        async with sem:
            return await run_test(client, *args)

    async with httpx.AsyncClient() as client:
        tasks = [bounded(client, *t, timeout) for t in tests]
        results: list[Result] = await asyncio.gather(*tasks)

    # Annotate failed results with any missing .env keys
    for r in results:
        if not r.ok:
            r.missing_keys = [
                v for v in required_vars.get(r.server, [])
                if v not in env_keys
            ]

    # ── Print results ─────────────────────────────────────────────────────────
    no_key = [r for r in results if not r.needs_key]
    with_key = [r for r in results if r.needs_key]

    def render_group(title: str, group: list[Result]) -> None:
        """Print one titled section with a line per result, sorted by server."""
        if not group:
            return
        print(f"\n{'━' * 72}")
        print(f" {title}")
        print(f"{'━' * 72}")
        for r in sorted(group, key=lambda x: x.server):
            icon = "✅" if r.ok else "❌"
            timing = f"{r.elapsed:.1f}s"
            if r.ok:
                detail = r.preview[:58]
            elif r.missing_keys:
                # A missing key is the most actionable explanation, so show it first.
                detail = f"not set in .env: {', '.join(r.missing_keys)}"
            else:
                detail = (r.error or r.preview)[:58]
            print(f" {icon} {r.server:<30} {timing:>6} {detail}")

    render_group("No API key required", no_key)
    render_group("API key required", with_key)

    passed = sum(1 for r in results if r.ok)
    tested = len(results)
    failed = [r for r in results if not r.ok]

    print(f"\n{'━' * 72}")
    if only_server is not None:
        print(f" Result: {passed}/{tested} passed (filtered to '{only_server}')", end="")
    else:
        print(f" Result: {passed}/{total} passed ({tested} tested, {total - tested} no test defined)", end="")
    if failed:
        print(f"\n Failed: {', '.join(r.server for r in failed)}")
    else:
        print(" 🎉 All clear!")
    print(f"{'━' * 72}\n")

    # Non-zero exit status lets CI or shell scripts detect failures.
    if failed:
        raise SystemExit(1)
370+
371+
372+
if __name__ == "__main__":
    # Script entry point: read the command-line options, then run the
    # asynchronous health check to completion.
    cli = argparse.ArgumentParser(description="Test all MCP servers from the template")
    cli.add_argument(
        "--timeout",
        type=float,
        default=30,
        help="Per-request timeout in seconds (default: 30)",
    )
    cli.add_argument(
        "--concurrency",
        type=int,
        default=8,
        help="Max parallel requests (default: 8)",
    )
    cli.add_argument(
        "--server",
        metavar="NAME",
        help="Test only this server (e.g. --server github)",
    )
    options = cli.parse_args()

    health_check = main(options.timeout, options.concurrency, options.server)
    asyncio.run(health_check)

0 commit comments

Comments
 (0)