Skip to content

Commit 60bf0c9

Browse files
committed
disable system prompt. variable eval pass-threshold. improve readme.
1 parent b6c0271 commit 60bf0c9

File tree

5 files changed

+26
-12
lines changed

5 files changed

+26
-12
lines changed

README.md

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
MCP-Atlas is a comprehensive benchmark for evaluating AI models' tool-use capabilities across 36 Model Context Protocol (MCP) servers. It provides a standardized environment for running agent completions and evaluating performance with LLM-as-judge methodology.
44

5-
- Paper: [LINK TO PAPER - TODO]
5+
- Paper: [https://static.scale.com/uploads/674f4cc7a74e35bcaae1c29a/MCP_Atlas.pdf](https://static.scale.com/uploads/674f4cc7a74e35bcaae1c29a/MCP_Atlas.pdf) or ([local copy](assets/MCP_Atlas.pdf))
66
- Leaderboard: [https://scale.com/leaderboard/mcp_atlas](https://scale.com/leaderboard/mcp_atlas)
77
- Dataset: [https://huggingface.co/datasets/ScaleAI/MCP-Atlas](https://huggingface.co/datasets/ScaleAI/MCP-Atlas)
88

@@ -53,6 +53,8 @@ We use [LiteLLM](https://docs.litellm.ai/) to support 100+ LLMs via a unified AP
5353

5454
### 2. Start the MCP servers
5555

56+
**Note: Allocate at least 8GB of memory to Docker (10GB+ recommended).**
57+
5658
**Option A: Use pre-built image (faster, recommended):**
5759

5860
```bash
@@ -163,6 +165,7 @@ Options:
163165
- `--evaluator-model` - Override model (default: `EVAL_LLM_MODEL` env var or `gemini/gemini-2.5-pro`)
164166
- `--num-tasks` - Limit to first N tasks
165167
- `--concurrency` - Concurrent API requests (default: 5)
168+
- `--pass-threshold` - Coverage score threshold for pass rate calculation (default: 0.75)
166169

167170
Outputs saved to `evaluation_results/`:
168171
- `scored_gpt51.csv` - Coverage scores for each task. On macOS, the "Numbers" app works better for opening CSV files with multi-line rows.
@@ -227,7 +230,9 @@ uv run mcp_evals_scores.py \
227230

228231
### 9. Evaluate other models
229232

230-
To benchmark other models, repeat step 8 with a different `--model` and `--output`:
233+
To benchmark other models, repeat step 8 with a different `--model` and `--output`.
234+
235+
If you are changing `LLM_API_KEY` you'll also have to restart `make run-mcp-completion`.
231236

232237
See [LiteLLM's supported models](https://docs.litellm.ai/docs/providers) for the full list of available providers and model names. For self-hosted models, change `LLM_BASE_URL`.
233238

assets/MCP_Atlas.pdf

842 KB
Binary file not shown.

env.template

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,10 @@ EVAL_LLM_BASE_URL=
9393
LLM_API_KEY=
9494
LLM_BASE_URL=
9595

96+
# System prompt for agent completion (default: false, no system prompt)
97+
# With the system prompt enabled, the pass rate is ~10% higher (see SYSTEM_PROMPT in mcp_completion_script.py)
98+
USE_SYSTEM_PROMPT_IN_COMPLETION=
99+
96100

97101

98102

services/mcp_eval/mcp_completion_script.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,9 @@ def get_retry_delay(attempt: int) -> float:
6969
jitter = delay * random.uniform(0, 0.5)
7070
return delay + jitter
7171

72-
# System prompt for the model
72+
# System prompt for the model (only used if USE_SYSTEM_PROMPT_IN_COMPLETION=true)
7373
SYSTEM_PROMPT = "Role: You are a factual, tool-aware assistant connected to a variety of tools. Use the available tools to answer the user query. Do not ask the user for clarification; fully complete the task using the information provided in the prompt."
74+
USE_SYSTEM_PROMPT = os.getenv("USE_SYSTEM_PROMPT_IN_COMPLETION", "").lower() == "true"
7475

7576
@dataclass
7677
class ToolCall:
@@ -208,12 +209,14 @@ async def run_live_task_async(self, enabled_tools: List[str], user_prompt: str,
208209
def uuid14():
209210
return str(uuid.uuid4()).replace('-', '')[-14:]
210211

212+
messages = []
213+
if USE_SYSTEM_PROMPT:
214+
messages.append({"role": "system", "content": SYSTEM_PROMPT})
215+
messages.append({"role": "user", "content": user_prompt})
216+
211217
payload = {
212218
"model": self.llm_model,
213-
"messages": [
214-
{"role": "system", "content": SYSTEM_PROMPT},
215-
{"role": "user", "content": user_prompt}
216-
],
219+
"messages": messages,
217220
"enabledTools": enabled_tools,
218221
"enableThinkingTokens": False,
219222
}

services/mcp_eval/mcp_evals_scores.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ def get_litellm_config():
7979
if not api_key:
8080
raise ValueError("LiteLLM API key not found. Set EVAL_LLM_API_KEY or LLM_API_KEY env var.")
8181

82-
api_base = os.getenv("EVAL_LLM_BASE_URL") or os.getenv("LLM_BASE_URL", "")
82+
api_base = os.getenv("EVAL_LLM_BASE_URL", "")
8383
return api_key, api_base
8484

8585

@@ -554,7 +554,7 @@ async def safe_evaluate(row_idx, row):
554554
# 3. STATISTICAL ANALYSIS AND PLOTTING
555555
# =========================================================================
556556

557-
def generate_statistics_and_plots(scored_csv_path: str, model_label: str, output_dir: str):
557+
def generate_statistics_and_plots(scored_csv_path: str, model_label: str, output_dir: str, pass_threshold: float = 0.75):
558558
"""Generates a summary stats CSV and a histogram plot of coverage scores."""
559559
logger = logging.getLogger(__name__)
560560
logger.info(f"Step 4: Generating statistics and plots for '{scored_csv_path}'...")
@@ -570,9 +570,9 @@ def generate_statistics_and_plots(scored_csv_path: str, model_label: str, output
570570
# Rename "mean" to "mean coverage score"
571571
stats_df.loc[stats_df["stat"] == "mean", "stat"] = "mean coverage score"
572572

573-
# Calculate pass rate (% of tasks where coverage_score >= 0.75)
573+
# Calculate pass rate (% of tasks where coverage_score >= pass_threshold)
574574
valid_scores = df["coverage_score"].dropna()
575-
pass_count = (valid_scores >= 0.75).sum()
575+
pass_count = (valid_scores >= pass_threshold).sum()
576576
total_count = len(valid_scores)
577577
pass_rate = pass_count / total_count if total_count > 0 else 0.0
578578

@@ -680,7 +680,7 @@ async def main(args):
680680
logger.info(f"Evaluation complete. Average coverage: {valid_scores.mean():.3f}")
681681

682682
# 3. Generate statistics and plots
683-
generate_statistics_and_plots(scored_path, args.model_label, output_dir)
683+
generate_statistics_and_plots(scored_path, args.model_label, output_dir, args.pass_threshold)
684684

685685
logger.info(f"\n🚀 Pipeline finished successfully!")
686686
logger.info(f"Results available in: {output_dir}")
@@ -710,6 +710,8 @@ async def main(args):
710710
help="Number of concurrent requests to the LLM API.")
711711
parser.add_argument("--num-tasks", type=int, default=None,
712712
help="Limit evaluation to first N tasks (useful for testing). If not specified, processes all tasks.")
713+
parser.add_argument("--pass-threshold", type=float, default=0.75,
714+
help="Coverage score threshold for pass rate calculation (default: 0.75)")
713715

714716
args = parser.parse_args()
715717

0 commit comments

Comments
 (0)