Skip to content

Commit 4957c5f

Browse files
authored
✨ feat: support per-model price/cost calculation (#186)
1 parent d61caec commit 4957c5f

File tree

2 files changed

+166
-11
lines changed

2 files changed

+166
-11
lines changed

src/aggregators/aggregate_results.py

Lines changed: 87 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import sys
1818
sys.path.append(str(Path(__file__).parent.parent.parent))
1919
from src.errors import is_retryable_error
20+
from src.aggregators.pricing import compute_cost_usd
2021

2122

2223
def discover_tasks() -> Dict[str, List[str]]:
@@ -280,6 +281,12 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
280281
avg_pass1 = 0.0
281282
std_pass1 = 0.0
282283

284+
# Compute per-run tokens and cost
285+
per_run_input_tokens = total_input_tokens / runs_count if runs_count else 0
286+
per_run_output_tokens = total_output_tokens / runs_count if runs_count else 0
287+
model_for_pricing = actual_model_name or model
288+
computed_per_run_cost = compute_cost_usd(model_for_pricing, per_run_input_tokens, per_run_output_tokens)
289+
283290
overall_metrics = {
284291
"total_tasks": total_tasks,
285292
"total_agent_execution_time": total_agent_execution_time,
@@ -292,9 +299,9 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
292299
"avg_output_tokens": round(avg_output_tokens, 4),
293300
"avg_total_tokens": round(avg_total_tokens, 4),
294301
"avg_turns": round(avg_turns, 4),
295-
"per_run_input_tokens": total_input_tokens / runs_count if runs_count else 0,
296-
"per_run_output_tokens": total_output_tokens / runs_count if runs_count else 0,
297-
"per_run_cost": per_run_cost if per_run_cost is not None else None,
302+
"per_run_input_tokens": per_run_input_tokens,
303+
"per_run_output_tokens": per_run_output_tokens,
304+
"per_run_cost": computed_per_run_cost if computed_per_run_cost is not None else (per_run_cost if per_run_cost is not None else None),
298305
"actual_model_name": actual_model_name or "",
299306
"pass@1": {
300307
"avg": round(avg_pass1, 4),
@@ -386,6 +393,11 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
386393
s_mean = 0.0
387394
s_std = 0.0
388395

396+
# Compute per-run tokens and cost for this service
397+
s_per_run_input_tokens = s_total_input_tokens / runs_count if runs_count else 0
398+
s_per_run_output_tokens = s_total_output_tokens / runs_count if runs_count else 0
399+
s_computed_per_run_cost = compute_cost_usd(model_for_pricing, s_per_run_input_tokens, s_per_run_output_tokens)
400+
389401
service_metrics = {
390402
"total_tasks": service_total_tasks,
391403
"total_agent_execution_time": s_total_agent_execution_time,
@@ -398,9 +410,9 @@ def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
398410
"avg_output_tokens": round(s_avg_output_tokens, 4),
399411
"avg_total_tokens": round(s_avg_total_tokens, 4),
400412
"avg_turns": round(s_avg_turns, 4),
401-
"per_run_input_tokens": s_total_input_tokens / runs_count if runs_count else 0,
402-
"per_run_output_tokens": s_total_output_tokens / runs_count if runs_count else 0,
403-
"per_run_cost": per_run_cost if per_run_cost is not None else None,
413+
"per_run_input_tokens": s_per_run_input_tokens,
414+
"per_run_output_tokens": s_per_run_output_tokens,
415+
"per_run_cost": s_computed_per_run_cost if s_computed_per_run_cost is not None else (per_run_cost if per_run_cost is not None else None),
404416
"actual_model_name": actual_model_name or "",
405417
"pass@1": {
406418
"avg": round(s_mean, 4),
@@ -484,14 +496,67 @@ def generate_task_results(exp_dir: Path, complete_models: Dict, all_tasks: Dict)
484496
for run_name, run_data in model_data[service].items():
485497
if task in run_data:
486498
meta = run_data[task]
499+
agent_time = float(meta.get("agent_execution_time", 0.0) or 0.0)
500+
token_usage = meta.get("token_usage", {}) or {}
501+
turn_count = int(meta.get("turn_count", 0) or 0)
502+
success = bool(meta.get("execution_result", {}).get("success", False))
487503
model_task_data["runs"].append({
488504
"run": run_name,
489-
"success": meta.get("execution_result", {}).get("success", False),
490-
"execution_time": meta.get("agent_execution_time", 0),
491-
"token_usage": meta.get("token_usage", {})
505+
"success": success,
506+
"execution_time": agent_time,
507+
"agent_execution_time": agent_time,
508+
"token_usage": token_usage,
509+
"turn_count": turn_count,
492510
})
493511

494512
if model_task_data["runs"]:
513+
# Compute per-model summary across runs for this task
514+
runs_list = model_task_data["runs"]
515+
runs_count = len(runs_list)
516+
successful_runs = sum(1 for r in runs_list if r.get("success"))
517+
518+
# Averages
519+
total_agent_time = sum(float(r.get("agent_execution_time", r.get("execution_time", 0.0)) or 0.0) for r in runs_list)
520+
avg_agent_time = round(total_agent_time / runs_count, 2)
521+
522+
def _tok(r, key):
523+
tu = r.get("token_usage") or {}
524+
return int(tu.get(key, 0) or 0)
525+
526+
total_input_tokens = 0
527+
total_output_tokens = 0
528+
total_total_tokens = 0
529+
for r in runs_list:
530+
in_tok = _tok(r, "input_tokens")
531+
out_tok = _tok(r, "output_tokens")
532+
ttl_tok = int((r.get("token_usage") or {}).get("total_tokens", in_tok + out_tok) or (in_tok + out_tok))
533+
total_input_tokens += in_tok
534+
total_output_tokens += out_tok
535+
total_total_tokens += ttl_tok
536+
537+
avg_input_tokens = round(total_input_tokens / runs_count, 1)
538+
avg_output_tokens = round(total_output_tokens / runs_count, 1)
539+
avg_total_tokens = round(total_total_tokens / runs_count, 1)
540+
541+
total_turns = sum(int(r.get("turn_count", 0) or 0) for r in runs_list)
542+
avg_turn_count = round(total_turns / runs_count, 2)
543+
544+
summary_obj = {
545+
"total_runs": runs_count,
546+
"successful_runs": successful_runs,
547+
"avg_agent_execution_time": avg_agent_time,
548+
"avg_input_tokens": avg_input_tokens,
549+
"avg_output_tokens": avg_output_tokens,
550+
"avg_total_tokens": avg_total_tokens,
551+
"avg_turn_count": avg_turn_count,
552+
}
553+
554+
# Include pass@k and pass^k only for multi-run models
555+
if runs_count > 1:
556+
summary_obj[f"pass@{runs_count}"] = 1.0 if successful_runs > 0 else 0.0
557+
summary_obj[f"pass^{runs_count}"] = 1.0 if successful_runs == runs_count else 0.0
558+
559+
model_task_data["summary"] = summary_obj
495560
task_data["models"][model] = model_task_data
496561

497562
# Save task file
@@ -525,7 +590,9 @@ def render_section(title: str, section_data: Dict[str, Any]) -> List[str]:
525590
if include_k:
526591
header += f" Pass@{k} | Pass^{k} |"
527592
sep += "----------|----------|"
528-
# Add Avg Turns and Avg Agent Time (s) at the end
593+
# Add Per-Run Cost (USD) and Avg Agent Time (s) at the end
594+
header += " Per-Run Cost (USD) |"
595+
sep += "---------------------|"
529596
header += " Avg Agent Time (s) |"
530597
sep += "--------------------|"
531598

@@ -542,6 +609,14 @@ def render_section(title: str, section_data: Dict[str, Any]) -> List[str]:
542609
for model, metrics in sorted_items:
543610
pass1_avg, pass1_std = get_pass1_avg_std(metrics)
544611
avg_time = float(metrics.get("avg_agent_execution_time", 0.0) or 0.0)
612+
# Format per-run cost (up to 2 decimal places, trim trailing zeros)
613+
cost_val = metrics.get("per_run_cost")
614+
if isinstance(cost_val, (int, float)):
615+
rounded_cost = round(float(cost_val), 2)
616+
formatted_cost = f"{rounded_cost:.2f}".rstrip('0').rstrip('.')
617+
cost_str = f"${formatted_cost}"
618+
else:
619+
cost_str = "/"
545620
row = (
546621
f"| {model} | {metrics.get('total_tasks', 0)} | "
547622
f"{pass1_avg * 100:.1f}% ± {pass1_std * 100:.1f}% |"
@@ -552,7 +627,8 @@ def render_section(title: str, section_data: Dict[str, Any]) -> List[str]:
552627
else:
553628
# Single-run models do not have pass@k or pass^k; show placeholders
554629
row += " / | / |"
555-
# Append avg agent time at the end
630+
# Append cost and avg agent time at the end
631+
row += f" {cost_str} |"
556632
row += f" {avg_time:.1f} |"
557633
lines_sec.append(row)
558634

src/aggregators/pricing.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
"""
2+
Pricing utilities for computing per-run cost from token usage.
3+
4+
All prices are specified per 1,000,000 tokens (M tokens) in USD.
5+
"""
6+
7+
from __future__ import annotations
8+
9+
from typing import Dict, Optional
10+
11+
12+
# Price map keyed by canonical model name (lowercased).
# Values are dicts with per-1,000,000-token USD prices for "input" and
# "output" tokens; lookups go through normalize_model_name(), so keys here
# must already be lowercase.
MODEL_PRICES_PER_M: Dict[str, Dict[str, float]] = {
    # Keys are the exact actual_model_name values (lowercased) reported by runs.
    # Anthropic
    "claude-opus-4-1-20250805": {"input": 15.0, "output": 75.0},
    "claude-sonnet-4-20250514": {"input": 3.0, "output": 15.0},

    # DeepSeek
    "deepseek-v3.1-non-think": {"input": 0.56, "output": 1.68},

    # Google Gemini
    "gemini-2.5-pro": {"input": 2.5, "output": 15.0},
    "gemini-2.5-flash": {"input": 0.3, "output": 2.5},

    # Z.AI
    "glm-4.5": {"input": 0.33, "output": 1.32},

    # OpenAI
    "gpt-5-2025-08-07": {"input": 1.25, "output": 10.0},
    "gpt-5-mini-2025-08-07": {"input": 0.25, "output": 2.0},
    "gpt-5-nano-2025-08-07": {"input": 0.05, "output": 0.4},
    "gpt-4.1-2025-04-14": {"input": 2.0, "output": 8.0},
    "gpt-4.1-mini-2025-04-14": {"input": 0.4, "output": 1.6},
    "gpt-4.1-nano-2025-04-14": {"input": 0.1, "output": 0.4},
    "o3-2025-04-16": {"input": 2.0, "output": 8.0},
    "o4-mini-2025-04-16": {"input": 1.1, "output": 4.4},
    "gpt-oss-120b": {"input": 0.072, "output": 0.28},

    # Qwen
    "qwen3-coder-480b-a35b-instruct": {"input": 0.2, "output": 0.8},

    # xAI
    "grok-4-0709": {"input": 3.0, "output": 15.0},
    "grok-code-fast-1": {"input": 0.2, "output": 1.5},

    # Moonshot
    "kimi-k2-0711-preview": {"input": 0.6, "output": 2.5},
}
51+
52+
53+
def normalize_model_name(model_name: str) -> str:
    """Canonicalize a model name for pricing-table lookup.

    None and the empty string both normalize to ""; otherwise surrounding
    whitespace is stripped and the result is lowercased.
    """
    if not model_name:
        return ""
    return model_name.strip().lower()
59+
60+
61+
def get_price_per_m(model_name: str) -> Optional[Dict[str, float]]:
    """Look up the per-1M-token price entry for *model_name*.

    Returns the ``{"input": ..., "output": ...}`` dict from
    MODEL_PRICES_PER_M, or None when the model has no pricing entry.
    """
    canonical = normalize_model_name(model_name)
    return MODEL_PRICES_PER_M.get(canonical)
65+
66+
67+
def compute_cost_usd(model_name: str, input_tokens: float, output_tokens: float) -> Optional[float]:
    """Compute the USD cost of a run from its token usage.

    Prices are quoted per 1,000,000 tokens. Returns None when no pricing
    is known for *model_name*; otherwise the cost rounded to 6 decimals.
    """
    prices = get_price_per_m(model_name)
    if not prices:
        # Unknown model — callers treat None as "cost unavailable".
        return None
    per_m = 1_000_000.0
    usd = (input_tokens / per_m) * prices["input"] + (output_tokens / per_m) * prices["output"]
    return float(round(usd, 6))
78+
79+

0 commit comments

Comments
 (0)