
Commit 9ba060d

✨ feat: add multiple metrics and aggregation helpers (#114)
1 parent e143cee · commit 9ba060d

File tree

7 files changed: +1478, -347 lines

pipeline.py

Lines changed: 57 additions & 23 deletions
@@ -51,10 +51,16 @@ def main():
         default=None,
         help="Experiment name; results are saved under results/<exp-name>/ (default: YYYY-MM-DD-HH-MM-SS)",
     )
+    parser.add_argument(
+        "--k",
+        type=int,
+        default=1,
+        help="Number of evaluation runs for pass@k metrics (default: 1)",
+    )
 
     # Execution configuration
     parser.add_argument(
-        "--timeout", type=int, default=1000, help="Timeout in seconds for each task"
+        "--timeout", type=int, default=3600, help="Timeout in seconds for each task"
     )
 
     # Output configuration
@@ -69,6 +75,10 @@ def main():
     args = parser.parse_args()
     load_dotenv(dotenv_path=".mcp_env", override=False)
 
+    # Validate k parameter and exp-name requirement
+    if args.k > 1 and args.exp_name is None:
+        parser.error("--exp-name is required when k > 1")
+
     # Generate default exp-name if not provided
     if args.exp_name is None:
         args.exp_name = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
@@ -87,30 +97,54 @@ def main():
 
     logger.info("MCPMark Evaluation")
     logger.info(f"Experiment: {args.exp_name} | {len(model_list)} Model(s): {', '.join(model_list)}")
-
-
-    # Run evaluation for each model
-    for i, model in enumerate(model_list, 1):
-        logger.info(f"\n{'=' * 60}")
-        logger.info(f"Starting evaluation {i}/{len(model_list)}: {model}")
-        logger.info(f"{'=' * 60}\n")
-
-        # Initialize and run the evaluation pipeline for this model
-        pipeline = MCPEvaluator(
-            mcp_service=args.mcp,
-            model=model,
-            timeout=args.timeout,
-            exp_name=args.exp_name,
-            output_dir=args.output_dir,
-        )
-
-        pipeline.run_evaluation(args.tasks)
-        logger.info(
-            f"📁 Results: {pipeline.base_experiment_dir}"
-        )
+    if args.k > 1:
+        logger.info(f"Running {args.k} evaluation runs for pass@k metrics")
+
+    # Run k evaluation runs
+    for run_idx in range(1, args.k + 1):
+        if args.k > 1:
+            logger.info(f"\n{'=' * 80}")
+            logger.info(f"Starting Run {run_idx}/{args.k}")
+            logger.info(f"{'=' * 80}\n")
+
+            # For k-runs, create run-N subdirectory
+            run_exp_name = f"run-{run_idx}"
+            run_output_dir = args.output_dir / args.exp_name
+        else:
+            # For single run (k=1), maintain backward compatibility
+            # Use run-1 subdirectory for consistency
+            run_exp_name = "run-1"
+            run_output_dir = args.output_dir / args.exp_name
+
+        # Run evaluation for each model
+        for i, model in enumerate(model_list, 1):
+            logger.info(f"\n{'=' * 60}")
+            if args.k > 1:
+                logger.info(f"Run {run_idx}/{args.k} | Model {i}/{len(model_list)}: {model}")
+            else:
+                logger.info(f"Starting evaluation {i}/{len(model_list)}: {model}")
+            logger.info(f"{'=' * 60}\n")
+
+            # Initialize and run the evaluation pipeline for this model
+            pipeline = MCPEvaluator(
+                mcp_service=args.mcp,
+                model=model,
+                timeout=args.timeout,
+                exp_name=run_exp_name,
+                output_dir=run_output_dir,
+            )
+
+            pipeline.run_evaluation(args.tasks)
+            logger.info(
+                f"📁 Results: {pipeline.base_experiment_dir}"
+            )
 
     logger.info(f"\n{'=' * 60}")
-    logger.info(f"✓ All evaluations completed for {len(model_list)} model(s)")
+    if args.k > 1:
+        logger.info(f"✓ All {args.k} runs completed for {len(model_list)} model(s)")
+        logger.info(f"Run aggregate_results.py to compute pass@k metrics")
+    else:
+        logger.info(f"✓ All evaluations completed for {len(model_list)} model(s)")
     logger.info(f"{'=' * 60}")
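When --k is greater than 1, the closing log message points the user at aggregate_results.py to compute pass@k; that helper is not shown in this diff. As a rough sketch of what such an aggregation step typically computes, the snippet below implements the standard unbiased pass@k estimator over per-task success counts. The function names, the success_counts input shape, and the assumption that verdicts are gathered from the run-1 through run-k subdirectories are illustrative only and are not taken from this repository.

# Illustrative sketch only; not the aggregate_results.py added by this commit.
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k for a single task: n total runs, c of which passed."""
    if n - c < k:
        # Fewer than k failures in total, so any sample of k runs contains a pass.
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

def aggregate_pass_at_k(success_counts: dict[str, int], n_runs: int, k: int) -> float:
    """Mean pass@k over tasks; success_counts maps task id -> number of passing runs,
    hypothetically gathered by scanning results/<exp-name>/run-1 ... run-<n_runs>."""
    scores = [pass_at_k(n_runs, c, k) for c in success_counts.values()]
    return sum(scores) / len(scores) if scores else 0.0

Under the layout introduced in this diff, each run writes into its own run-N folder beneath <output-dir>/<exp-name>, so an aggregation pass only needs to line up the same task's result across those folders before applying an estimator like the one above.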
