Skip to content

Commit 546b953

Browse files
committed
refact: mcpbench -> mcpmark, deprecate openai trace
1 parent bb3bb9d commit 546b953

File tree

116 files changed

+7959
-4566
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

116 files changed

+7959
-4566
lines changed

README.md

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -17,9 +17,6 @@ Before running MCPMark you need to prepare the environment for the MCP service y
1717
All environment variables **must** be set in a file named `.mcp_env` in your project root. Example:
1818

1919
```env
20-
# For OpenAI API tracing (optional)
21-
OPENAI_TRACE_API_KEY="sk-proj-xxx-xxx-xx"
22-
2320
# Service Credentials
2421
## Notion
2522
SOURCE_NOTION_API_KEY="your-source-notion-api-key" # For Source Hub (templates)

pipeline.py

Lines changed: 16 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -24,18 +24,16 @@
2424

2525
def main():
2626
"""Main entry point for the evaluation pipeline."""
27-
parser = argparse.ArgumentParser(
28-
description="MCPMark Unified Evaluation Pipeline."
29-
)
27+
parser = argparse.ArgumentParser(description="MCPMark Unified Evaluation Pipeline.")
3028

31-
supported_services = MCPServiceFactory.get_supported_services()
29+
supported_mcp_services = MCPServiceFactory.get_supported_mcp_services()
3230
supported_models = ModelConfig.get_supported_models()
3331

3432
# Main configuration
3533
parser.add_argument(
3634
"--mcp",
3735
default="notion",
38-
choices=supported_services,
36+
choices=supported_mcp_services,
3937
help="MCP service to use (default: notion)",
4038
)
4139
parser.add_argument(
@@ -59,7 +57,6 @@ def main():
5957
"--timeout", type=int, default=300, help="Timeout in seconds for each task"
6058
)
6159

62-
6360
# Output configuration
6461
parser.add_argument(
6562
"--output-dir",
@@ -71,7 +68,7 @@ def main():
7168
# Load arguments and environment variables
7269
args = parser.parse_args()
7370
load_dotenv(dotenv_path=".mcp_env", override=False)
74-
71+
7572
# Generate default exp-name if not provided
7673
if args.exp_name is None:
7774
args.exp_name = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
@@ -81,13 +78,17 @@ def main():
8178
model_list = [m.strip() for m in args.models.split(",") if m.strip()]
8279
if not model_list:
8380
parser.error("No valid models provided")
84-
81+
8582
# Log warning for unsupported models but don't error
8683
unsupported_models = [m for m in model_list if m not in supported_models]
8784
if unsupported_models:
88-
logger.warning(f"Using unsupported models: {', '.join(unsupported_models)}. Will use OPENAI_BASE_URL and OPENAI_API_KEY from environment.")
85+
logger.warning(
86+
f"Using unsupported models: {', '.join(unsupported_models)}. Will use OPENAI_BASE_URL and OPENAI_API_KEY from environment."
87+
)
8988

90-
logger.info(f"Running evaluation for {len(model_list)} model(s): {', '.join(model_list)}")
89+
logger.info(
90+
f"Running evaluation for {len(model_list)} model(s): {', '.join(model_list)}"
91+
)
9192

9293
# Run evaluation for each model
9394
for i, model in enumerate(model_list, 1):
@@ -97,17 +98,19 @@ def main():
9798

9899
# Initialize and run the evaluation pipeline for this model
99100
pipeline = MCPEvaluator(
100-
service=args.mcp,
101+
mcp_service=args.mcp,
101102
model=model,
102103
timeout=args.timeout,
103104
exp_name=args.exp_name,
104105
output_dir=args.output_dir,
105106
)
106107

107108
pipeline.run_evaluation(args.tasks)
108-
logger.info(f"✓ Evaluation completed for {model}. Results saved in: {pipeline.base_experiment_dir}")
109+
logger.info(
110+
f"✓ Evaluation completed for {model}. Results saved in: {pipeline.base_experiment_dir}"
111+
)
109112

110-
logger.info(f"\n{'='*60}")
113+
logger.info(f"\n{'=' * 60}")
111114
logger.info(f"✓ All evaluations completed for {len(model_list)} model(s)")
112115
logger.info(f"{'=' * 60}")
113116

results_parser.py

Lines changed: 12 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -101,7 +101,9 @@ def validate_and_gather_metrics(
101101

102102
# Check pipeline errors
103103
error_msg = meta.get("execution_result", {}).get("error_message")
104-
if error_msg and any(err in error_msg for err in src.evaluator.PIPELINE_RETRY_ERRORS):
104+
if error_msg and any(
105+
err in error_msg for err in src.evaluator.PIPELINE_RETRY_ERRORS
106+
):
105107
has_retry_error = True
106108

107109
# Collect metrics
@@ -138,11 +140,15 @@ def validate_and_gather_metrics(
138140
return True, metrics, None
139141

140142

141-
def plot_metrics(metrics: Dict[str, Dict[str, float]], exp_name: str, service: str, show: bool):
143+
def plot_metrics(
144+
metrics: Dict[str, Dict[str, float]], exp_name: str, service: str, show: bool
145+
):
142146
"""Create a bar chart visualizing success rate and avg tokens; annotate avg turns."""
143147

144148
# Sort by success-rate (desc)
145-
sorted_items = sorted(metrics.items(), key=lambda x: x[1]["success_rate"], reverse=True)
149+
sorted_items = sorted(
150+
metrics.items(), key=lambda x: x[1]["success_rate"], reverse=True
151+
)
146152
models = [m for m, _ in sorted_items]
147153
success_rates = [item[1]["success_rate"] for item in sorted_items]
148154
avg_tokens = [item[1]["avg_tokens"] for item in sorted_items]
@@ -270,7 +276,9 @@ def main():
270276
# Discover expected tasks for this service
271277
expected_tasks = discover_all_tasks(args.mcp)
272278
if not expected_tasks:
273-
print(f"[ERROR] Could not discover any tasks for service '{args.mcp}'. Exiting.")
279+
print(
280+
f"[ERROR] Could not discover any tasks for service '{args.mcp}'. Exiting."
281+
)
274282
return
275283

276284
metrics: Dict[str, Dict[str, float]] = {}

0 commit comments

Comments (0)