Skip to content

Commit 60bf0c9

Browse files
committed
disable system prompt. variable eval pass-threshold. improve readme.
1 parent b6c0271 commit 60bf0c9

File tree

5 files changed

+26
-12
lines changed

5 files changed

+26
-12
lines changed

README.md

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
MCP-Atlas is a comprehensive benchmark for evaluating AI models' tool-use capabilities across 36 Model Context Protocol (MCP) servers. It provides a standardized environment for running agent completions and evaluating performance with LLM-as-judge methodology.
44

5-
- Paper: [LINK TO PAPER - TODO]
5+
- Paper: [https://static.scale.com/uploads/674f4cc7a74e35bcaae1c29a/MCP_Atlas.pdf](https://static.scale.com/uploads/674f4cc7a74e35bcaae1c29a/MCP_Atlas.pdf) or ([local copy](assets/MCP_Atlas.pdf))
66
- Leaderboard: [https://scale.com/leaderboard/mcp_atlas](https://scale.com/leaderboard/mcp_atlas)
77
- Dataset: [https://huggingface.co/datasets/ScaleAI/MCP-Atlas](https://huggingface.co/datasets/ScaleAI/MCP-Atlas)
88

@@ -53,6 +53,8 @@ We use [LiteLLM](https://docs.litellm.ai/) to support 100+ LLMs via a unified AP
5353

5454
### 2. Start the MCP servers
5555

56+
**Note: Allocate at least 8GB of memory to Docker (10GB+ recommended).**
57+
5658
**Option A: Use pre-built image (faster, recommended):**
5759

5860
```bash
@@ -163,6 +165,7 @@ Options:
163165
- `--evaluator-model` - Override model (default: `EVAL_LLM_MODEL` env var or `gemini/gemini-2.5-pro`)
164166
- `--num-tasks` - Limit to first N tasks
165167
- `--concurrency` - Concurrent API requests (default: 5)
168+
- `--pass-threshold` - Coverage score threshold for pass rate calculation (default: 0.75)
166169

167170
Outputs saved to `evaluation_results/`:
168171
- `scored_gpt51.csv` - Coverage scores for each task. On macOS, the "Numbers" app works better for opening CSV files with multi-line rows.
@@ -227,7 +230,9 @@ uv run mcp_evals_scores.py \
227230

228231
### 9. Evaluate other models
229232

230-
To benchmark other models, repeat step 8 with a different `--model` and `--output`:
233+
To benchmark other models, repeat step 8 with a different `--model` and `--output`.
234+
235+
If you are changing `LLM_API_KEY` you'll also have to restart `make run-mcp-completion`.
231236

232237
See [LiteLLM's supported models](https://docs.litellm.ai/docs/providers) for the full list of available providers and model names. For self-hosted models, change `LLM_BASE_URL`.
233238

assets/MCP_Atlas.pdf

842 KB
Binary file not shown.

env.template

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,10 @@ EVAL_LLM_BASE_URL=
9393
LLM_API_KEY=
9494
LLM_BASE_URL=
9595

96+
# System prompt for agent completion (default: false, no system prompt)
97+
# With the system prompt enabled, the pass rate is ~10% higher (see SYSTEM_PROMPT in mcp_completion_script.py)
98+
USE_SYSTEM_PROMPT_IN_COMPLETION=
99+
96100

97101

98102

services/mcp_eval/mcp_completion_script.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,9 @@ def get_retry_delay(attempt: int) -> float:
6969
jitter = delay * random.uniform(0, 0.5)
7070
return delay + jitter
7171

72-
# System prompt for the model
72+
# System prompt for the model (only used if USE_SYSTEM_PROMPT_IN_COMPLETION=true)
7373
SYSTEM_PROMPT = "Role: You are a factual, tool-aware assistant connected to a variety of tools. Use the available tools to answer the user query. Do not ask the user for clarification; fully complete the task using the information provided in the prompt."
74+
USE_SYSTEM_PROMPT = os.getenv("USE_SYSTEM_PROMPT_IN_COMPLETION", "").lower() == "true"
7475

7576
@dataclass
7677
class ToolCall:
@@ -208,12 +209,14 @@ async def run_live_task_async(self, enabled_tools: List[str], user_prompt: str,
208209
def uuid14():
209210
return str(uuid.uuid4()).replace('-', '')[-14:]
210211

212+
messages = []
213+
if USE_SYSTEM_PROMPT:
214+
messages.append({"role": "system", "content": SYSTEM_PROMPT})
215+
messages.append({"role": "user", "content": user_prompt})
216+
211217
payload = {
212218
"model": self.llm_model,
213-
"messages": [
214-
{"role": "system", "content": SYSTEM_PROMPT},
215-
{"role": "user", "content": user_prompt}
216-
],
219+
"messages": messages,
217220
"enabledTools": enabled_tools,
218221
"enableThinkingTokens": False,
219222
}

services/mcp_eval/mcp_evals_scores.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ def get_litellm_config():
7979
if not api_key:
8080
raise ValueError("LiteLLM API key not found. Set EVAL_LLM_API_KEY or LLM_API_KEY env var.")
8181

82-
api_base = os.getenv("EVAL_LLM_BASE_URL") or os.getenv("LLM_BASE_URL", "")
82+
api_base = os.getenv("EVAL_LLM_BASE_URL", "")
8383
return api_key, api_base
8484

8585

@@ -554,7 +554,7 @@ async def safe_evaluate(row_idx, row):
554554
# 3. STATISTICAL ANALYSIS AND PLOTTING
555555
# =========================================================================
556556

557-
def generate_statistics_and_plots(scored_csv_path: str, model_label: str, output_dir: str):
557+
def generate_statistics_and_plots(scored_csv_path: str, model_label: str, output_dir: str, pass_threshold: float = 0.75):
558558
"""Generates a summary stats CSV and a histogram plot of coverage scores."""
559559
logger = logging.getLogger(__name__)
560560
logger.info(f"Step 4: Generating statistics and plots for '{scored_csv_path}'...")
@@ -570,9 +570,9 @@ def generate_statistics_and_plots(scored_csv_path: str, model_label: str, output
570570
# Rename "mean" to "mean coverage score"
571571
stats_df.loc[stats_df["stat"] == "mean", "stat"] = "mean coverage score"
572572

573-
# Calculate pass rate (% of tasks where coverage_score >= 0.75)
573+
# Calculate pass rate (% of tasks where coverage_score >= pass_threshold)
574574
valid_scores = df["coverage_score"].dropna()
575-
pass_count = (valid_scores >= 0.75).sum()
575+
pass_count = (valid_scores >= pass_threshold).sum()
576576
total_count = len(valid_scores)
577577
pass_rate = pass_count / total_count if total_count > 0 else 0.0
578578

@@ -680,7 +680,7 @@ async def main(args):
680680
logger.info(f"Evaluation complete. Average coverage: {valid_scores.mean():.3f}")
681681

682682
# 3. Generate statistics and plots
683-
generate_statistics_and_plots(scored_path, args.model_label, output_dir)
683+
generate_statistics_and_plots(scored_path, args.model_label, output_dir, args.pass_threshold)
684684

685685
logger.info(f"\n🚀 Pipeline finished successfully!")
686686
logger.info(f"Results available in: {output_dir}")
@@ -710,6 +710,8 @@ async def main(args):
710710
help="Number of concurrent requests to the LLM API.")
711711
parser.add_argument("--num-tasks", type=int, default=None,
712712
help="Limit evaluation to first N tasks (useful for testing). If not specified, processes all tasks.")
713+
parser.add_argument("--pass-threshold", type=float, default=0.75,
714+
help="Coverage score threshold for pass rate calculation (default: 0.75)")
713715

714716
args = parser.parse_args()
715717

0 commit comments

Comments
 (0)