@@ -51,10 +51,16 @@ def main():
         default=None,
         help="Experiment name; results are saved under results/<exp-name>/ (default: YYYY-MM-DD-HH-MM-SS)",
     )
+    parser.add_argument(
+        "--k",
+        type=int,
+        default=1,
+        help="Number of evaluation runs for pass@k metrics (default: 1)",
+    )
 
     # Execution configuration
     parser.add_argument(
-        "--timeout", type=int, default=1000, help="Timeout in seconds for each task"
+        "--timeout", type=int, default=3600, help="Timeout in seconds for each task"
     )
 
     # Output configuration
@@ -69,6 +75,10 @@ def main():
     args = parser.parse_args()
     load_dotenv(dotenv_path=".mcp_env", override=False)
 
+    # Validate k parameter and exp-name requirement
+    if args.k > 1 and args.exp_name is None:
+        parser.error("--exp-name is required when k > 1")
+
     # Generate default exp-name if not provided
     if args.exp_name is None:
         args.exp_name = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
@@ -87,30 +97,54 @@ def main():
 
     logger.info("MCPMark Evaluation")
     logger.info(f"Experiment: {args.exp_name} | {len(model_list)} Model(s): {', '.join(model_list)}")
-
-
-    # Run evaluation for each model
-    for i, model in enumerate(model_list, 1):
-        logger.info(f"\n{'=' * 60}")
-        logger.info(f"Starting evaluation {i}/{len(model_list)}: {model}")
-        logger.info(f"{'=' * 60}\n")
-
-        # Initialize and run the evaluation pipeline for this model
-        pipeline = MCPEvaluator(
-            mcp_service=args.mcp,
-            model=model,
-            timeout=args.timeout,
-            exp_name=args.exp_name,
-            output_dir=args.output_dir,
-        )
-
-        pipeline.run_evaluation(args.tasks)
-        logger.info(
-            f"📁 Results: {pipeline.base_experiment_dir}"
-        )
+    if args.k > 1:
+        logger.info(f"Running {args.k} evaluation runs for pass@k metrics")
+
+    # Run k evaluation runs
+    for run_idx in range(1, args.k + 1):
+        if args.k > 1:
+            logger.info(f"\n{'=' * 80}")
+            logger.info(f"Starting Run {run_idx}/{args.k}")
+            logger.info(f"{'=' * 80}\n")
+
+            # For k runs, write each run into its own run-N subdirectory
+            run_exp_name = f"run-{run_idx}"
+            run_output_dir = args.output_dir / args.exp_name
+        else:
+            # For a single run (k=1), keep backward compatibility;
+            # still use a run-1 subdirectory so the layout stays uniform
+            run_exp_name = "run-1"
+            run_output_dir = args.output_dir / args.exp_name
+
+        # Run evaluation for each model
+        for i, model in enumerate(model_list, 1):
+            logger.info(f"\n{'=' * 60}")
+            if args.k > 1:
+                logger.info(f"Run {run_idx}/{args.k} | Model {i}/{len(model_list)}: {model}")
+            else:
+                logger.info(f"Starting evaluation {i}/{len(model_list)}: {model}")
+            logger.info(f"{'=' * 60}\n")
+
+            # Initialize and run the evaluation pipeline for this model
+            pipeline = MCPEvaluator(
+                mcp_service=args.mcp,
+                model=model,
+                timeout=args.timeout,
+                exp_name=run_exp_name,
+                output_dir=run_output_dir,
+            )
+
+            pipeline.run_evaluation(args.tasks)
+            logger.info(
+                f"📁 Results: {pipeline.base_experiment_dir}"
+            )
 
     logger.info(f"\n{'=' * 60}")
-    logger.info(f"✓ All evaluations completed for {len(model_list)} model(s)")
+    if args.k > 1:
+        logger.info(f"✓ All {args.k} runs completed for {len(model_list)} model(s)")
+        logger.info("Run aggregate_results.py to compute pass@k metrics")
+    else:
+        logger.info(f"✓ All evaluations completed for {len(model_list)} model(s)")
     logger.info(f"{'=' * 60}")
 
 
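Note on the design: every run lands in a `run-N` subdirectory under `results/<exp-name>/`, even when `k = 1`, so downstream tooling can treat single-run and multi-run experiments uniformly. The `aggregate_results.py` script the final log message points to is not part of this diff; for reference, pass@k is conventionally computed with the unbiased estimator from Chen et al. (2021). A minimal sketch, assuming each task ends up with `n` recorded attempts (here, the `--k` value) of which `c` passed:

```python
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimator (Chen et al., 2021).

    n: total runs recorded for the task
    c: runs in which the task passed
    k: number of samples drawn when estimating pass@k
    """
    if n - c < k:
        # Fewer failures than samples: at least one success is guaranteed.
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

# Example: a task that passed 2 of 5 runs (results/<exp-name>/run-1 .. run-5)
print(pass_at_k(5, 2, 1))  # 0.4
print(pass_at_k(5, 2, 5))  # 1.0
```

How the actual aggregation script locates and parses per-run results is an assumption left to `aggregate_results.py`; the uniform `run-*` layout introduced here is what makes that aggregation a simple directory glob.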