1717import sys
1818sys .path .append (str (Path (__file__ ).parent .parent .parent ))
1919from src .errors import is_retryable_error
20+ from src .aggregators .pricing import compute_cost_usd
2021
2122
2223def discover_tasks () -> Dict [str , List [str ]]:
@@ -280,6 +281,12 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
280281 avg_pass1 = 0.0
281282 std_pass1 = 0.0
282283
284+ # Compute per-run tokens and cost
285+ per_run_input_tokens = total_input_tokens / runs_count if runs_count else 0
286+ per_run_output_tokens = total_output_tokens / runs_count if runs_count else 0
287+ model_for_pricing = actual_model_name or model
288+ computed_per_run_cost = compute_cost_usd (model_for_pricing , per_run_input_tokens , per_run_output_tokens )
289+
283290 overall_metrics = {
284291 "total_tasks" : total_tasks ,
285292 "total_agent_execution_time" : total_agent_execution_time ,
@@ -292,9 +299,9 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
292299 "avg_output_tokens" : round (avg_output_tokens , 4 ),
293300 "avg_total_tokens" : round (avg_total_tokens , 4 ),
294301 "avg_turns" : round (avg_turns , 4 ),
295- "per_run_input_tokens" : total_input_tokens / runs_count if runs_count else 0 ,
296- "per_run_output_tokens" : total_output_tokens / runs_count if runs_count else 0 ,
297- "per_run_cost" : per_run_cost if per_run_cost is not None else None ,
302+ "per_run_input_tokens" : per_run_input_tokens ,
303+ "per_run_output_tokens" : per_run_output_tokens ,
304+ "per_run_cost" : computed_per_run_cost if computed_per_run_cost is not None else ( per_run_cost if per_run_cost is not None else None ) ,
298305 "actual_model_name" : actual_model_name or "" ,
299306 "pass@1" : {
300307 "avg" : round (avg_pass1 , 4 ),
@@ -386,6 +393,11 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
386393 s_mean = 0.0
387394 s_std = 0.0
388395
396+ # Compute per-run tokens and cost for this service
397+ s_per_run_input_tokens = s_total_input_tokens / runs_count if runs_count else 0
398+ s_per_run_output_tokens = s_total_output_tokens / runs_count if runs_count else 0
399+ s_computed_per_run_cost = compute_cost_usd (model_for_pricing , s_per_run_input_tokens , s_per_run_output_tokens )
400+
389401 service_metrics = {
390402 "total_tasks" : service_total_tasks ,
391403 "total_agent_execution_time" : s_total_agent_execution_time ,
@@ -398,9 +410,9 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
398410 "avg_output_tokens" : round (s_avg_output_tokens , 4 ),
399411 "avg_total_tokens" : round (s_avg_total_tokens , 4 ),
400412 "avg_turns" : round (s_avg_turns , 4 ),
401- "per_run_input_tokens" : s_total_input_tokens / runs_count if runs_count else 0 ,
402- "per_run_output_tokens" : s_total_output_tokens / runs_count if runs_count else 0 ,
403- "per_run_cost" : per_run_cost if per_run_cost is not None else None ,
413+ "per_run_input_tokens" : s_per_run_input_tokens ,
414+ "per_run_output_tokens" : s_per_run_output_tokens ,
415+ "per_run_cost" : s_computed_per_run_cost if s_computed_per_run_cost is not None else ( per_run_cost if per_run_cost is not None else None ) ,
404416 "actual_model_name" : actual_model_name or "" ,
405417 "pass@1" : {
406418 "avg" : round (s_mean , 4 ),
@@ -484,14 +496,67 @@ def generate_task_results(exp_dir: Path, complete_models: Dict, all_tasks: Dict)
484496 for run_name , run_data in model_data [service ].items ():
485497 if task in run_data :
486498 meta = run_data [task ]
499+ agent_time = float (meta .get ("agent_execution_time" , 0.0 ) or 0.0 )
500+ token_usage = meta .get ("token_usage" , {}) or {}
501+ turn_count = int (meta .get ("turn_count" , 0 ) or 0 )
502+ success = bool (meta .get ("execution_result" , {}).get ("success" , False ))
487503 model_task_data ["runs" ].append ({
488504 "run" : run_name ,
489- "success" : meta .get ("execution_result" , {}).get ("success" , False ),
490- "execution_time" : meta .get ("agent_execution_time" , 0 ),
491- "token_usage" : meta .get ("token_usage" , {})
505+ "success" : success ,
506+ "execution_time" : agent_time ,
507+ "agent_execution_time" : agent_time ,
508+ "token_usage" : token_usage ,
509+ "turn_count" : turn_count ,
492510 })
493511
494512 if model_task_data ["runs" ]:
513+ # Compute per-model summary across runs for this task
514+ runs_list = model_task_data ["runs" ]
515+ runs_count = len (runs_list )
516+ successful_runs = sum (1 for r in runs_list if r .get ("success" ))
517+
518+ # Averages
519+ total_agent_time = sum (float (r .get ("agent_execution_time" , r .get ("execution_time" , 0.0 )) or 0.0 ) for r in runs_list )
520+ avg_agent_time = round (total_agent_time / runs_count , 2 )
521+
522+ def _tok (r , key ):
523+ tu = r .get ("token_usage" ) or {}
524+ return int (tu .get (key , 0 ) or 0 )
525+
526+ total_input_tokens = 0
527+ total_output_tokens = 0
528+ total_total_tokens = 0
529+ for r in runs_list :
530+ in_tok = _tok (r , "input_tokens" )
531+ out_tok = _tok (r , "output_tokens" )
532+ ttl_tok = int ((r .get ("token_usage" ) or {}).get ("total_tokens" , in_tok + out_tok ) or (in_tok + out_tok ))
533+ total_input_tokens += in_tok
534+ total_output_tokens += out_tok
535+ total_total_tokens += ttl_tok
536+
537+ avg_input_tokens = round (total_input_tokens / runs_count , 1 )
538+ avg_output_tokens = round (total_output_tokens / runs_count , 1 )
539+ avg_total_tokens = round (total_total_tokens / runs_count , 1 )
540+
541+ total_turns = sum (int (r .get ("turn_count" , 0 ) or 0 ) for r in runs_list )
542+ avg_turn_count = round (total_turns / runs_count , 2 )
543+
544+ summary_obj = {
545+ "total_runs" : runs_count ,
546+ "successful_runs" : successful_runs ,
547+ "avg_agent_execution_time" : avg_agent_time ,
548+ "avg_input_tokens" : avg_input_tokens ,
549+ "avg_output_tokens" : avg_output_tokens ,
550+ "avg_total_tokens" : avg_total_tokens ,
551+ "avg_turn_count" : avg_turn_count ,
552+ }
553+
554+ # Include pass@k and pass^k only for multi-run models
555+ if runs_count > 1 :
556+ summary_obj [f"pass@{ runs_count } " ] = 1.0 if successful_runs > 0 else 0.0
557+ summary_obj [f"pass^{ runs_count } " ] = 1.0 if successful_runs == runs_count else 0.0
558+
559+ model_task_data ["summary" ] = summary_obj
495560 task_data ["models" ][model ] = model_task_data
496561
497562 # Save task file
@@ -525,7 +590,9 @@ def render_section(title: str, section_data: Dict[str, Any]) -> List[str]:
525590 if include_k :
526591 header += f" Pass@{ k } | Pass^{ k } |"
527592 sep += "----------|----------|"
528- # Add Avg Turns and Avg Agent Time (s) at the end
593+ # Add Per-Run Cost (USD) and Avg Agent Time (s) at the end
594+ header += " Per-Run Cost (USD) |"
595+ sep += "---------------------|"
529596 header += " Avg Agent Time (s) |"
530597 sep += "--------------------|"
531598
@@ -542,6 +609,14 @@ def render_section(title: str, section_data: Dict[str, Any]) -> List[str]:
542609 for model , metrics in sorted_items :
543610 pass1_avg , pass1_std = get_pass1_avg_std (metrics )
544611 avg_time = float (metrics .get ("avg_agent_execution_time" , 0.0 ) or 0.0 )
612+ # Format per-run cost (up to 2 decimal places, trim trailing zeros)
613+ cost_val = metrics .get ("per_run_cost" )
614+ if isinstance (cost_val , (int , float )):
615+ rounded_cost = round (float (cost_val ), 2 )
616+ formatted_cost = f"{ rounded_cost :.2f} " .rstrip ('0' ).rstrip ('.' )
617+ cost_str = f"${ formatted_cost } "
618+ else :
619+ cost_str = "/"
545620 row = (
546621 f"| { model } | { metrics .get ('total_tasks' , 0 )} | "
547622 f"{ pass1_avg * 100 :.1f} % ± { pass1_std * 100 :.1f} % |"
@@ -552,7 +627,8 @@ def render_section(title: str, section_data: Dict[str, Any]) -> List[str]:
552627 else :
553628 # Single-run models do not have pass@k or pass^k; show placeholders
554629 row += " / | / |"
555- # Append avg agent time at the end
630+ # Append cost and avg agent time at the end
631+ row += f" { cost_str } |"
556632 row += f" { avg_time :.1f} |"
557633 lines_sec .append (row )
558634
0 commit comments