diff --git a/ROADMAP.md b/ROADMAP.md index cdbc725..1f9c026 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -1,6 +1,6 @@ # Deepr Roadmap -> Development priorities and planned features. Model/pricing notes updated through March 2026. +> Development priorities and planned features. Model/pricing notes updated through May 2026. ## Quick Links @@ -45,7 +45,7 @@ These features are well-tested and used regularly: - **Expert creation**: `expert make`, `expert chat`, `expert export/import` - **CLI output modes**: `--verbose`, `--json`, `--quiet`, `--explain` - **Context discovery**: `deepr search`, `--context ` for reusing prior research -- **Provider support**: OpenAI (GPT-5.4, GPT-5.4-pro, GPT-5-mini, GPT-4.1, o3/o4-mini-deep-research), Gemini (3.1 Pro Preview, 3 Flash, 2.5 Flash, Deep Research Agent), xAI Grok (4.3 flagship, 4.20 Reasoning/Non-Reasoning/Multi-Agent), Anthropic (Claude Opus/Sonnet/Haiku 4.5), Azure AI Foundry (o3-deep-research + Bing, GPT-5/5-mini, GPT-4.1/4.1-mini, GPT-4o) +- **Provider support**: OpenAI (GPT-5.4, GPT-5.4-pro, GPT-5-mini, GPT-4.1, o3/o4-mini-deep-research), Gemini (3.1 Pro Preview, 3.5 Flash, 3 Flash, 2.5 Flash, Deep Research Agent), xAI Grok (4.3 flagship, 4.20 Reasoning/Non-Reasoning/Multi-Agent), Anthropic (Claude Opus 4.7/4.6, Sonnet 4.6/4.5, Haiku 4.5), Azure AI Foundry (o3-deep-research + Bing, GPT-5/5-mini, GPT-4.1/4.1-mini, GPT-4o) - **Local storage**: SQLite persistence, markdown reports, expert profiles ### Experimental (Works but Evolving) @@ -64,7 +64,7 @@ These features work but APIs or behavior may change: ### What Works (Full List) -- Multi-provider support (OpenAI GPT-5.4/5-mini/4.1, Gemini 3.1 Pro/Flash-Lite/2.5, Grok 4.3/4.20, Anthropic Claude, Azure, Azure AI Foundry) +- Multi-provider support (OpenAI GPT-5.4/5-mini/4.1, Gemini 3.5 Flash/3.1 Pro/Flash-Lite/2.5, Grok 4.3/4.20, Anthropic Claude, Azure, Azure AI Foundry) - Deep Research via OpenAI API (o3/o4-mini-deep-research) and Gemini Interactions API (Deep Research 
Agent) - Semantic commands (`research`, `learn`, `team`, `check`, `make`) - Expert system with autonomous learning, agentic chat (streaming, 27 slash commands, 4 chat modes, visible reasoning, context compaction, approval flows, expert council, task planning, memory commands), knowledge synthesis, curriculum preview (`expert plan`), domain-specific skills, AI-generated portraits @@ -214,8 +214,10 @@ See [docs/INTEGRATIONS.md](docs/INTEGRATIONS.md) for the full integration contra Goal: continuously validate routing quality/cost claims with measurable feedback. -- [ ] `deepr providers models` command (model discovery UX) +- [x] `deepr providers models` command (model discovery UX): live provider model lists diffed against the registry, scoped by default to newer versions of families already in use, with paste-ready registry stubs (`scripts/discover_models.py`) - [ ] Stale-model CI checks + provider-family alerting + - [x] `deepr eval` preflight warns when newer relevant models are missing from the registry + - [ ] Scheduled CI job that alerts on provider model drift - [ ] Routing preview: `deepr research --preview --auto` shows exact model choice, estimated cost, and confidence before executing - [ ] Eval methodology v2: - [ ] Citation quality, grounding, synthesis depth, temporal accuracy diff --git a/deepr/api/app.py b/deepr/api/app.py index 8acf048..6e19e29 100644 --- a/deepr/api/app.py +++ b/deepr/api/app.py @@ -667,8 +667,12 @@ def submit_job(): # Update status run_async(queue.update_status(job_id=job_id, status=JobStatus.PROCESSING, provider_job_id=provider_job_id)) - # Calculate cost estimate - avg_cost = 0.5 if "mini" in model else 5.0 + # Calculate cost estimate from the registry (source of truth). A prior + # name heuristic ("mini" -> $0.5 else $5.0) wildly misestimated nano / + # flash-lite (over) and deep-research (under) models. 
@providers.command()
@click.option(
    "--all",
    "show_all",
    is_flag=True,
    help="Show every discovered model, not just newer versions of families already in the registry",
)
@click.option(
    "--provider",
    "target_provider",
    type=click.Choice(["openai", "anthropic", "gemini", "xai", "azure-foundry"]),
    help="Only check this provider",
)
@click.option("--json", "json_output", is_flag=True, help="Output as JSON (for CI / scripting)")
@click.option("--no-stubs", is_flag=True, help="Don't print suggested registry-entry stubs")
def models(show_all: bool, target_provider: Optional[str], json_output: bool, no_stubs: bool):
    """Discover newer provider models missing from the registry.

    Queries each configured provider's live model list and flags newer versions
    of model families already in the registry (e.g. a new mini/nano tier, or a
    preview that has gone GA). Use --all to see every discovered model.

    Examples:
        deepr providers models
        deepr providers models --provider openai
        deepr providers models --all
        deepr providers models --json
    """
    # Imported locally: only this command needs them.
    import subprocess
    import sys
    from pathlib import Path

    # The discovery logic lives in a standalone script three directory levels
    # above this module; this command is a thin wrapper that forwards flags.
    script = Path(__file__).resolve().parents[3] / "scripts" / "discover_models.py"
    if not script.is_file():
        # Fail with an actionable message instead of letting the child
        # interpreter print "can't open file" followed by a generic status error.
        raise click.ClickException(
            f"Model discovery script not found at {script}. "
            "This command needs the repository's scripts/ directory to be present."
        )

    # Argument list (never a shell string), run with the current interpreter.
    cmd = [sys.executable, str(script)]
    if show_all:
        cmd.append("--all")
    if no_stubs:
        cmd.append("--no-stubs")
    if json_output:
        cmd += ["--format", "json"]
    if target_provider:
        cmd += ["--provider", target_provider]

    # stdout/stderr are inherited, so the script's report (or JSON) goes
    # straight to the caller's terminal / pipe.
    result = subprocess.run(cmd)
    if result.returncode != 0:
        raise click.ClickException(f"Model discovery exited with status {result.returncode}")
str) -> return self.deep_research_cost_estimate base_model = model - for key in self.pricing: + # Match the longest pricing key first so e.g. "gemini-2.5-flash-lite" + # resolves to its own entry instead of the shorter "gemini-2.5-flash" + # prefix — which would charge Flash-Lite at ~5x the Flash rate. + for key in sorted(self.pricing, key=len, reverse=True): if key in model: base_model = key break diff --git a/deepr/providers/grok_provider.py b/deepr/providers/grok_provider.py index bf3fb3b..da71e55 100644 --- a/deepr/providers/grok_provider.py +++ b/deepr/providers/grok_provider.py @@ -72,6 +72,12 @@ def __init__( "grok-4.20-non-reasoning": "grok-4.20-0309-non-reasoning", "grok-4.20-multi-agent": "grok-4.20-multi-agent-0309", "grok-4.20": "grok-4.20-0309-non-reasoning", + # Hyphenated registry forms (registry keys use grok-4-20-*, not the + # dotted API form). Without these, a routed "grok-4-20-reasoning" + # falls through unmapped → wrong API id + ~11x cost undercharge. + "grok-4-20-reasoning": "grok-4.20-0309-reasoning", + "grok-4-20-non-reasoning": "grok-4.20-0309-non-reasoning", + "grok-4-20-multi-agent": "grok-4.20-multi-agent-0309", # Grok 4.1 Fast budget tier "grok-4-1-fast-reasoning": "grok-4-1-fast-reasoning", "grok-4-1-fast-non-reasoning": "grok-4-1-fast-non-reasoning", @@ -98,10 +104,15 @@ def __init__( _grok_4_1_fast = get_token_pricing("grok-4-1-fast-non-reasoning") self.pricing = { - # Grok 4.20 flagship ($2/$6 per MTok) + # Grok 4.20 flagship ($2/$6 per MTok). Keyed under the dotted API + # ids and the hyphenated registry forms so cost accounting is correct + # regardless of which name reaches _calculate_cost. 
"grok-4.20-0309-reasoning": {"input": 2.00, "output": 6.00}, "grok-4.20-0309-non-reasoning": {"input": 2.00, "output": 6.00}, "grok-4.20-multi-agent-0309": {"input": 2.00, "output": 6.00}, + "grok-4-20-reasoning": {"input": 2.00, "output": 6.00}, + "grok-4-20-non-reasoning": {"input": 2.00, "output": 6.00}, + "grok-4-20-multi-agent": {"input": 2.00, "output": 6.00}, # Grok 4.3 ($1.25/$2.50 per MTok). Listed under both the canonical # name and the hyphenated alias so any caller / older code path # that submits "grok-4-3" still gets accurate cost accounting. diff --git a/deepr/providers/registry.py b/deepr/providers/registry.py index 79d3242..3d34a44 100644 --- a/deepr/providers/registry.py +++ b/deepr/providers/registry.py @@ -67,6 +67,47 @@ class ModelCapability: input_cost_per_1m=30.00, output_cost_per_1m=180.00, ), + "openai/gpt-5.4-mini": ModelCapability( + provider="openai", + model="gpt-5.4-mini", + cost_per_query=0.05, + latency_ms=1500, + context_window=400_000, + specializations=["reasoning", "speed", "balanced", "agentic"], + strengths=[ + "Newer-generation budget reasoning (GPT-5.4 family)", + "Good reasoning at low cost", + "Fast responses", + "400K context window", + "Configurable reasoning effort", + ], + weaknesses=[ + "Pricier than gpt-5-mini ($0.75/$4.50 vs $0.25/$2.00 per MTok)", + "Less capable than full gpt-5.4", + ], + input_cost_per_1m=0.75, + output_cost_per_1m=4.50, + ), + "openai/gpt-5.4-nano": ModelCapability( + provider="openai", + model="gpt-5.4-nano", + cost_per_query=0.01, + latency_ms=800, + context_window=400_000, + specializations=["speed", "cost", "general", "summarization"], + strengths=[ + "Cheapest GPT-5.4 variant ($0.20/$1.25 per MTok)", + "Very fast responses", + "400K context window", + "Good for summarization and classification", + ], + weaknesses=[ + "Lowest reasoning capability in GPT-5.4 family", + "Pricier than gpt-5-nano ($0.20/$1.25 vs $0.05/$0.40 per MTok)", + ], + input_cost_per_1m=0.20, + output_cost_per_1m=1.25, + 
), "openai/gpt-5.5": ModelCapability( provider="openai", model="gpt-5.5", @@ -525,6 +566,29 @@ class ModelCapability: successor="xai/grok-imagine-image", ), # Google Models (Gemini) + # Gemini 3.5 Flash — newest Flash generation (GA May 19, 2026, Google I/O 2026) + "gemini/gemini-3.5-flash": ModelCapability( + provider="gemini", + model="gemini-3.5-flash", + cost_per_query=0.03, + latency_ms=1500, + context_window=1_000_000, + specializations=["reasoning", "coding", "agentic", "multimodal", "speed", "thinking"], + strengths=[ + "First model in the Gemini 3.5 family (GA May 19, 2026)", + "Surpasses Gemini 3.1 Pro on coding, agentic, and multimodal benchmarks", + "Frontier intelligence at Flash speed (~4x faster output than frontier peers)", + "1M token context window, 65K output", + "Multimodal input (text, image, audio, video, PDF)", + "Dynamic thinking", + ], + weaknesses=[ + "3x pricier than Gemini 3 Flash preview ($1.50/$9.00 vs $0.50/$3.00 per MTok)", + "Thinking tokens add to output cost", + ], + input_cost_per_1m=1.50, + output_cost_per_1m=9.00, # Includes thinking tokens; non-global regions $1.65/$9.90 + ), "gemini/gemini-3-flash-preview": ModelCapability( provider="gemini", model="gemini-3-flash-preview", @@ -538,10 +602,34 @@ class ModelCapability: "1M token context window", "Dynamic thinking", ], - weaknesses=["Preview model (may change)", "Thinking tokens add to output cost"], + weaknesses=[ + "Preview model (may change)", + "Thinking tokens add to output cost", + "Superseded for quality by gemini-3.5-flash (which costs ~3x more)", + ], input_cost_per_1m=0.50, output_cost_per_1m=3.00, # Includes thinking tokens ), + # Gemini 3.1 Flash-Lite — GA (May 7, 2026); most cost-effective Gemini + "gemini/gemini-3.1-flash-lite": ModelCapability( + provider="gemini", + model="gemini-3.1-flash-lite", + cost_per_query=0.007, + latency_ms=1300, + context_window=1_000_000, + specializations=["speed", "cost", "general", "high_throughput", "thinking"], + strengths=[ + 
"Most cost-effective Gemini model (GA May 7, 2026)", + "1M token context window", + "Low-cost high-throughput inference", + "Dynamic thinking", + ], + weaknesses=[ + "Less capable than Pro/Flash models on deep reasoning", + ], + input_cost_per_1m=0.25, + output_cost_per_1m=1.50, + ), "gemini/gemini-3.1-flash-lite-preview": ModelCapability( provider="gemini", model="gemini-3.1-flash-lite-preview", @@ -557,6 +645,7 @@ class ModelCapability: weaknesses=[ "Preview model (lifecycle may change)", "Less capable than Pro models on deep reasoning", + "Superseded by GA gemini-3.1-flash-lite ($0.25/$1.50 per MTok)", ], input_cost_per_1m=0.20, output_cost_per_1m=1.20, @@ -676,6 +765,30 @@ class ModelCapability: # Note: Anthropic does NOT have a turnkey deep research API like OpenAI/Gemini. # Research capability is achieved via Extended Thinking + tool use + our orchestration. # For research, we recommend Opus 4.6 - best reasoning with Adaptive Thinking. + # Claude Opus 4.7 — most capable Claude (GA Apr 16, 2026); leads SWE-bench Pro + "anthropic/claude-opus-4-7": ModelCapability( + provider="anthropic", + model="claude-opus-4-7", + cost_per_query=0.85, # Same per-token rate as 4.6, but new tokenizer (~35% more tokens) + latency_ms=12000, + context_window=1_000_000, # Full 1M at standard pricing + specializations=["research", "reasoning", "coding", "analysis", "complex_tasks", "agents"], + strengths=[ + "Most capable Claude model (GA Apr 16, 2026)", + "Leads SWE-bench Pro (64.3%)", + "Adaptive Thinking (auto-adjusts reasoning effort)", + "Full 1M token context window at standard pricing", + "128K max output tokens", + "Fast mode available (6x price for faster output)", + ], + weaknesses=[ + "No native deep research API (requires orchestration)", + "New tokenizer uses up to 35% more tokens for the same text (higher effective cost)", + "Slower than Sonnet (~12s vs ~3s)", + ], + input_cost_per_1m=5.00, + output_cost_per_1m=25.00, + ), "anthropic/claude-opus-4-6": ModelCapability( 
provider="anthropic", model="claude-opus-4-6", @@ -695,10 +808,33 @@ class ModelCapability: "No native deep research API (requires orchestration)", "Slower than Sonnet (~12s vs ~3s)", "Higher cost than Sonnet (~$0.80 vs ~$0.48/query)", + "Superseded by claude-opus-4-7 (same price)", ], input_cost_per_1m=5.00, output_cost_per_1m=25.00, ), + # Claude Sonnet 4.6 — best value for everyday coding (GA Apr 2026) + "anthropic/claude-sonnet-4-6": ModelCapability( + provider="anthropic", + model="claude-sonnet-4-6", + cost_per_query=0.48, # Estimated with 16K thinking budget + latency_ms=3000, + context_window=1_000_000, # Full 1M at standard pricing + specializations=["reasoning", "coding", "analysis", "balanced", "agents"], + strengths=[ + "Best speed/intelligence balance; best value for everyday coding", + "Fast responses (~3s)", + "Extended Thinking support", + "Full 1M token context window at standard pricing", + "64K max output tokens", + ], + weaknesses=[ + "Less capable than Opus 4.7 for complex research", + "No native deep research API", + ], + input_cost_per_1m=3.00, + output_cost_per_1m=15.00, + ), "anthropic/claude-sonnet-4-5": ModelCapability( provider="anthropic", model="claude-sonnet-4-5", @@ -717,6 +853,7 @@ class ModelCapability: "Less capable than Opus 4.6 for complex research", "No Adaptive Thinking", "No native deep research API", + "Superseded by claude-sonnet-4-6 (same price)", ], input_cost_per_1m=3.00, output_cost_per_1m=15.00, @@ -974,20 +1111,29 @@ def get_cost_estimate(model: str, input_tokens: int | None = None) -> float: Estimated cost per query in USD. Returns 0.20 if model not found. """ resolved = _MODEL_ALIASES.get(model, model) + needle = _normalize_model_name(resolved) base = 0.20 + + # Exact match (normalized) first. 
for cap in MODEL_CAPABILITIES.values(): - if cap.model == resolved: + if _normalize_model_name(cap.model) == needle: base = cap.cost_per_query break else: - for cap in MODEL_CAPABILITIES.values(): - if cap.model in resolved: + # Partial match — longest cap.model first so e.g. a "gpt-5.4-mini" + # snapshot matches its own entry before the shorter "gpt-5.4" prefix. + # Without longest-first this both over-charges (mini -> full price) and, + # worse, under-charges ("gpt-5.4-pro-" -> cheaper "gpt-5.4"), + # letting budget pre-flight approve an expensive job. Mirrors + # get_token_pricing(). + for cap in sorted(MODEL_CAPABILITIES.values(), key=lambda c: len(c.model or ""), reverse=True): + if _normalize_model_name(cap.model) in needle: base = cap.cost_per_query break if input_tokens is not None: for tiered_model, (threshold, multiplier) in _TIERED_PRICING.items(): - if tiered_model in resolved and input_tokens > threshold: + if _normalize_model_name(tiered_model) in needle and input_tokens > threshold: return round(base * multiplier, 4) return base diff --git a/docs/MODELS.md b/docs/MODELS.md index cd5d155..cda104c 100644 --- a/docs/MODELS.md +++ b/docs/MODELS.md @@ -1,6 +1,6 @@ # Model Selection Guide -> **Note**: Model information current as of March 2026. AI models evolve rapidly — verify current pricing at provider websites. The [model registry](../deepr/providers/registry.py) is the source of truth. Run `python scripts/discover_models.py --show-registry` to see all registered models with pricing. +> **Note**: Model information current as of May 2026. AI models evolve rapidly — verify current pricing at provider websites. The [model registry](../deepr/providers/registry.py) is the source of truth. Run `deepr providers models` to diff the registry against each provider's live model list (flags newer versions of families you already use, with paste-ready registry stubs), or `python scripts/discover_models.py --show-registry` to see all registered models with pricing. 
## Overview @@ -12,14 +12,15 @@ Deepr uses a hybrid approach optimizing for both quality and cost. Different tas ### OpenAI (`OPENAI_API_KEY`) - **Deep Research**: Turnkey async Deep Research API via Responses endpoint -- **Models**: o3-deep-research, o4-mini-deep-research, GPT-5.4, GPT-5.4-pro, GPT-5-mini, GPT-4.1, GPT-4.1-mini +- **Models**: o3-deep-research, o4-mini-deep-research, GPT-5.5, GPT-5.4, GPT-5.4-pro, GPT-5.4-mini, GPT-5.4-nano, GPT-5-mini, GPT-4.1, GPT-4.1-mini - **Best for**: Deep research, planning, expert system (vector stores require OpenAI-compatible API) - **Note**: GPT-5.4 is the current mainline default; use GPT-5.4-pro for hardest tasks and GPT-5-mini for value ### Google Gemini (`GEMINI_API_KEY`) - **Deep Research**: Native Deep Research Agent via Interactions API (async background jobs) -- **Models**: Gemini 3.1 Pro Preview (default), Gemini 3 Flash, Gemini 2.5 Flash, Deep Research Agent (`deep-research-pro-preview-12-2025`) +- **Models**: Gemini 3.1 Pro Preview (default), Gemini 3.5 Flash, Gemini 3 Flash, Gemini 3.1 Flash-Lite (GA), Gemini 2.5 Flash, Deep Research Agent (`deep-research-pro-preview-12-2025`) - **Best for**: Large context windows (1M+ tokens), document analysis, cost-effective research, agentic workflows +- **Note**: Gemini 3.5 Flash (GA May 19, 2026, Google I/O 2026) is the newest Flash generation — it beats Gemini 3.1 Pro on coding/agentic/multimodal benchmarks at ~4x faster output, priced at $1.50/$9.00 per MTok ### xAI Grok (`XAI_API_KEY`) - **Deep Research**: Grok 4.20 Multi-Agent (4–16 parallel agents with autonomous tool use) @@ -67,9 +68,9 @@ The following legacy models will stop accepting API requests on **May 15, 2026 a ### Anthropic Claude (`ANTHROPIC_API_KEY`) - **Deep Research**: No turnkey API — uses Extended Thinking + tool use + web search orchestration -- **Models**: Claude Opus 4.6, Claude Sonnet 4.5, Claude Haiku 4.5 +- **Models**: Claude Opus 4.7, Claude Opus 4.6, Claude Sonnet 4.6, Claude Sonnet 4.5, 
Claude Haiku 4.5 - **Best for**: Complex reasoning with transparent thinking, coding tasks, nuanced analysis -- **Note**: Opus 4.6 (latest) recommended for research (~$0.80/query). All models support Extended Thinking. Requires a web search backend (Brave, Tavily, or DuckDuckGo) +- **Note**: Opus 4.7 (GA Apr 16, 2026; leads SWE-bench Pro at 64.3%) is the latest and recommended for research (~$0.85/query, $5/$25 per MTok — note its new tokenizer uses up to ~35% more tokens for the same text). Sonnet 4.6 ($3/$15) is the best-value coding model. Opus 4.7/4.6 and Sonnet 4.6 include the full 1M context window at standard pricing. All models support Extended Thinking. Requires a web search backend (Brave, Tavily, or DuckDuckGo) ### Azure OpenAI (`AZURE_OPENAI_KEY`) - **Models**: Same as OpenAI, deployed through Azure @@ -157,8 +158,9 @@ The agent automatically uses Bing web grounding to find and cite current sources | Quick Lookups | Grok 4.20 Non-Reasoning | see registry | ~1s | best value for freshness/citation tasks | | Latest News / Web | Grok 4.20 Non-Reasoning | see registry | ~1s | real-time web + strong value | | Large Documents | Gemini 3.1 Pro | $0.20* | ~40s | 1M token context, configurable thinking | -| Coding Tasks | Claude Sonnet 4.5 | $0.48 | ~3s | Best for code | -| Complex Reasoning | Claude Opus 4.6 | see registry | ~seconds | high-end complex reasoning | +| Fast Coding / Agentic | Gemini 3.5 Flash | ~$0.03 | ~1.5s | Beats 3.1 Pro on coding/agentic at Flash speed ($1.50/$9.00 per MTok) | +| Coding Tasks | Claude Sonnet 4.6 | $0.48 | ~3s | Best-value coding ($3/$15 per MTok) | +| Complex Reasoning | Claude Opus 4.7 | see registry | ~seconds | Most capable Claude; leads SWE-bench Pro | | Budget General | GPT-4.1-mini | $0.01 | ~1s | Cheapest OpenAI, 1M context | *\*Gemini 3.1 Pro has tiered pricing: $2/$12 per 1M tokens (input/output) for prompts ≤200K tokens, $4/$18 for prompts >200K tokens. The $0.20/query estimate assumes a typical sub-200K prompt. 
Large document analysis (250K+ tokens) costs roughly 2x more — e.g., a 500K-token corpus costs ~$2.27 vs ~$1.18 with sub-200K prompts. Use `--dry-run` to check before running.* diff --git a/scripts/benchmark_models.py b/scripts/benchmark_models.py index 5ecea71..99a76cf 100644 --- a/scripts/benchmark_models.py +++ b/scripts/benchmark_models.py @@ -636,11 +636,13 @@ class ModelSummary: # Frontier models "openai/gpt-5.5", # Newest OpenAI frontier (April 2026, 1M+ context) "openai/gpt-5.4", # Previous OpenAI frontier (1M+ context) - "anthropic/claude-opus-4-6", # Most capable Claude ($0.80/query) + "anthropic/claude-opus-4-7", # Most capable Claude — leads SWE-bench Pro ($0.85/query) + "anthropic/claude-opus-4-6", # Prior most-capable Claude ($0.80/query) "gemini/gemini-3.1-pro-preview", # Latest gen, best quality ($0.20/query) "gemini/gemini-2.5-pro", # Thinking model, can't disable thinking ($0.15/query) # Mid-tier - "anthropic/claude-sonnet-4-5", # Strong reasoning ($0.48/query) + "anthropic/claude-sonnet-4-6", # Best-value coding Claude ($0.48/query) + "anthropic/claude-sonnet-4-5", # Prior Sonnet ($0.48/query) "openai/gpt-4.1", # 1M context ($0.04/query) "openai/o3", # Reasoning model for complex tasks ($0.10/query) "openai/o4-mini", # Fast reasoning ($0.04/query) @@ -649,12 +651,16 @@ class ModelSummary: "xai/grok-4-20-reasoning", # xAI multi-agent workhorse ($0.10/query) "xai/grok-4-20-non-reasoning", # xAI flagship non-reasoning ($0.08/query) # Budget models + "openai/gpt-5.4-mini", # Newer-gen budget reasoning ($0.05/query) "openai/gpt-5-mini", # Budget reasoning ($0.03/query) "openai/gpt-4.1-mini", # Cheap 1M context ($0.01/query) + "openai/gpt-5.4-nano", # Cheapest GPT-5.4 ($0.01/query) "openai/gpt-5-nano", # Cheapest GPT-5 ($0.005/query) "openai/gpt-4.1-nano", # Cheapest 1M context ($0.003/query) - "gemini/gemini-3-flash-preview", # Newest gen, fast ($0.01/query) - "gemini/gemini-3.1-flash-lite-preview", # Lowest-cost Gemini 3.1 + 
def warn_if_newer_models_available() -> None:
    """Best-effort preflight: warn if providers expose newer models than the
    registry knows about.

    Reuses scripts/discover_models.py so the "newer version of a family we use"
    logic lives in one place. Never raises — discovery is networked and optional,
    and must not block a benchmark. Failures are recorded at DEBUG level so
    they can still be diagnosed when verbose logging is enabled.
    """
    import logging

    try:
        spec = importlib.util.spec_from_file_location(
            "discover_models", PROJECT_ROOT / "scripts" / "discover_models.py"
        )
        if spec is None or spec.loader is None:
            # No loadable module spec: nothing to check.
            return
        dm = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(dm)

        reg = dm.load_registry()
        discovered = dm.discover_via_api()
        if not discovered:
            return
        report = dm.compare_registry(reg, discovered)
        relevant, _ = dm.classify_new_models(report["new_models"], reg)
        if relevant:
            # Name at most six models inline; summarize the remainder as a count.
            names = ", ".join(f"{m.provider}/{m.model_id}" for m in relevant[:6])
            extra = "" if len(relevant) <= 6 else f" (+{len(relevant) - 6} more)"
            print(f"  NOTE: {len(relevant)} newer model(s) available but not in the registry: {names}{extra}")
            print("        Run 'deepr providers models' to review them and get registry stubs.")
            print()
    except Exception:
        # Discovery is best-effort; never block a benchmark on it. Keep a
        # DEBUG-level trace rather than discarding the failure entirely.
        logging.getLogger(__name__).debug("Model discovery preflight skipped", exc_info=True)
The merge below adds historical evals (for richer rankings), + # but those incurred no new spend — so the reported cost must come from + # this snapshot, not the merged dataset. + executed_results = list(all_results) + # ─── Merge prior saved results (--fill-gaps) ───────────────────────── if args.fill_gaps and prior_saved: # Convert prior saved results to EvalResult objects for merged reporting @@ -2993,8 +3049,14 @@ def main(): # ─── Phase 4: Report ───────────────────────────────────────────────── summaries = build_summaries(all_results, registry) - # Calculate actual cost - total_cost = sum(s.total_cost for s in summaries) + # Cost reflects only evals executed this run. When --fill-gaps merges + # prior history into the rankings, summing every summary's cost would + # report the (huge) cost of re-running the entire dataset rather than + # this run's actual spend. + if len(executed_results) != len(all_results): + total_cost = sum(s.total_cost for s in build_summaries(executed_results, registry)) + else: + total_cost = sum(s.total_cost for s in summaries) # Save results first (before report printing, which can fail on encoding) if args.save: diff --git a/scripts/discover_models.py b/scripts/discover_models.py index 492b511..dbc0325 100644 --- a/scripts/discover_models.py +++ b/scripts/discover_models.py @@ -33,6 +33,7 @@ import json import logging import os +import re import sys from dataclasses import asdict, dataclass from pathlib import Path @@ -41,6 +42,15 @@ PROJECT_ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(PROJECT_ROOT)) +# Load .env so keys configured there (not just exported in the shell) are +# visible — otherwise discovery silently skips providers like Anthropic/Azure. 
+try: + from dotenv import load_dotenv + + load_dotenv(PROJECT_ROOT / ".env") +except ImportError: + pass + logging.basicConfig(level=logging.WARNING, format="%(levelname)s: %(message)s") logger = logging.getLogger(__name__) @@ -536,6 +546,31 @@ def _parse_llm_response(text: str) -> list[DiscoveredModel]: # ─── Comparison ─────────────────────────────────────────────────────────────── +_DATE_SUFFIX_RE = re.compile(r"-\d{4}-\d{2}-\d{2}\b") # -2026-03-05 +_DATE_COMPACT_RE = re.compile(r"-\d{8}\b") # -20251001 +_SNAPSHOT_RE = re.compile(r"-\d{4}\b(?=-|$)") # -0309 (xAI dated builds) + + +def _canonical_model(model_id: str) -> str: + """Reduce an API model ID to a registry-comparable canonical form. + + Providers expose models with naming variations that exact-match comparison + treats as entirely different models, producing huge false-positive diffs: + - dot vs hyphen: grok-4.3 ↔ grok-4-3 + - trailing date stamps: claude-haiku-4-5-20251001 ↔ claude-haiku-4-5 + gpt-5.4-2026-03-05 ↔ gpt-5.4 + - embedded build dates: grok-4.20-0309-reasoning ↔ grok-4-20-reasoning + + Normalizing both sides before comparison collapses these to the real model. + """ + s = model_id.lower().replace(".", "-") + s = _DATE_SUFFIX_RE.sub("", s) + s = _DATE_COMPACT_RE.sub("", s) + s = _SNAPSHOT_RE.sub("", s) + s = re.sub(r"-{2,}", "-", s).strip("-") + return s + + def compare_registry( registry: dict[str, RegistryModel], discovered: list[DiscoveredModel], @@ -547,21 +582,29 @@ def compare_registry( - pricing_changes: models where pricing differs - in_registry: models that match - registry_only: models in registry but not discovered + + Matching is done on a canonicalized (provider, model) key so naming + variations (dot/hyphen, date snapshots) don't masquerade as new models. 
""" - # Build lookup from discovered models - discovered_lookup: dict[str, DiscoveredModel] = {} + # Canonical registry index: (provider, canonical_model) -> RegistryModel + registry_by_canon: dict[tuple[str, str], RegistryModel] = {} + for rm in registry.values(): + registry_by_canon[(rm.provider, _canonical_model(rm.model))] = rm + + # Deduplicate discovered models that canonicalize to the same key (e.g. a + # dated snapshot plus its alias) so we don't report the same model twice. + discovered_by_canon: dict[tuple[str, str], DiscoveredModel] = {} for m in discovered: - # Normalize key: provider/model_id - key = f"{m.provider}/{m.model_id}" - discovered_lookup[key] = m + discovered_by_canon.setdefault((m.provider, _canonical_model(m.model_id)), m) new_models = [] pricing_changes = [] in_registry = [] - for key, dm in discovered_lookup.items(): - if key in registry: - rm = registry[key] + for canon_key, dm in discovered_by_canon.items(): + rm = registry_by_canon.get(canon_key) + if rm is not None: + key = f"{rm.provider}/{rm.model}" in_registry.append({"key": key, "discovered": dm, "registry": rm}) # Check pricing changes (only if LLM provided pricing) @@ -579,14 +622,11 @@ def compare_registry( } ) else: - # Check if any registry key contains this model ID (partial match) - partial = [k for k in registry if dm.model_id in k] - if not partial: - new_models.append(dm) + new_models.append(dm) - # Models in registry but not discovered - discovered_model_ids = {m.model_id for m in discovered} - registry_only = [rm for rm in registry.values() if rm.model not in discovered_model_ids] + # Models in registry but not discovered (canonical comparison) + discovered_canon = set(discovered_by_canon.keys()) + registry_only = [rm for rm in registry.values() if (rm.provider, _canonical_model(rm.model)) not in discovered_canon] return { "new_models": new_models, @@ -628,18 +668,152 @@ def print_registry_table(registry: dict[str, RegistryModel]): print(f" Total: {len(registry)} 
models\n") -def print_comparison_report(report: dict): - """Print a human-readable comparison report.""" +# Non-text / non-research model families we never want to surface as routing +# candidates (image, audio, embeddings, speech, robotics, etc.). +_NON_TEXT_MARKERS = ( + "tts", + "transcribe", + "diarize", + "embedding", + "image", + "audio", + "realtime", + "robotics", + "computer-use", + "search-preview", + "whisper", + "dall-e", + "moderation", +) + +_VERSION_TOKEN_RE = re.compile(r"^\d+(?:\.\d+)?$") +_DATEISH_TOKEN_RE = re.compile(r"^\d{4,8}$") +_QUALIFIER_TOKENS = {"preview", "latest", "stable", "exp", "experimental"} + + +def _is_text_model(model_id: str) -> bool: + """True unless the model is an image/audio/embedding/etc. variant.""" + mid = model_id.lower() + return not any(marker in mid for marker in _NON_TEXT_MARKERS) + + +def _family_signature(model_id: str) -> str: + """Drop version/date/qualifier tokens so variants of one family share a key. + + gpt-5.4-mini and gpt-5-mini both -> 'gpt-mini'; claude-opus-4-7 -> 'claude-opus'. + """ + parts = [ + tok + for tok in model_id.lower().split("-") + if not (_VERSION_TOKEN_RE.match(tok) or _DATEISH_TOKEN_RE.match(tok) or tok in _QUALIFIER_TOKENS) + ] + return "-".join(parts) + + +def _version_key(model_id: str) -> tuple[float, ...]: + """Comparable numeric version (gpt-5.4-mini -> (5.4,), claude-opus-4-7 -> (4.7,)).""" + nums = re.findall(r"\d+(?:\.\d+)?", model_id.lower().replace("-", ".")) + return tuple(float(n) for n in nums) or (0.0,) + + +def classify_new_models( + new_models: list[DiscoveredModel], + registry: dict[str, RegistryModel], +) -> tuple[list[DiscoveredModel], list[DiscoveredModel]]: + """Split discovered-but-unregistered models into (relevant, other). + + Relevant = a text model sharing a (provider, family) with the registry, that + is either a higher version than what we have, or the GA release of a family + we currently only have in preview. 
Everything else (brand-new families, + older versions, non-text variants) goes to 'other'. + """ + fam_max: dict[tuple[str, str], tuple[float, ...]] = {} + reg_canon: set[tuple[str, str]] = set() + reg_preview_bases: set[tuple[str, str]] = set() + for rm in registry.values(): + fam = (rm.provider, _family_signature(rm.model)) + v = _version_key(rm.model) + fam_max[fam] = max(v, fam_max.get(fam, v)) + canon = _canonical_model(rm.model) + reg_canon.add((rm.provider, canon)) + if canon.endswith("-preview"): + reg_preview_bases.add((rm.provider, canon[: -len("-preview")])) + + relevant, other = [], [] + for m in new_models: + if not _is_text_model(m.model_id): + other.append(m) + continue + mc = _canonical_model(m.model_id) + # GA promotion: registry has X-preview but not the GA X. + is_ga_promotion = ( + "preview" not in mc and (m.provider, mc) in reg_preview_bases and (m.provider, mc) not in reg_canon + ) + fam = (m.provider, _family_signature(m.model_id)) + is_newer = fam in fam_max and _version_key(m.model_id) > fam_max[fam] + if is_newer or is_ga_promotion: + relevant.append(m) + else: + other.append(m) + return relevant, other + + +def _registry_stub(m: DiscoveredModel) -> str: + """Render a ready-to-paste ModelCapability stub for a discovered model. + + Pricing (which provider APIs don't expose) and tuning fields are left as + TODOs to fill from the provider's pricing page before the entry goes live — + wrong pricing silently corrupts budget enforcement and the cost ledger.
+ """ + cw = m.context_window or 0 + return ( + f' "{m.provider}/{m.model_id}": ModelCapability(\n' + f' provider="{m.provider}",\n' + f' model="{m.model_id}",\n' + f" cost_per_query=0.0, # TODO: estimate per-query cost\n" + f" latency_ms=2000, # TODO: measure\n" + f" context_window={cw if cw else 'TODO'},\n" + f" specializations=[], # TODO\n" + f" strengths=[], # TODO\n" + f" weaknesses=[],\n" + f" input_cost_per_1m=0.0, # TODO: from provider pricing page\n" + f" output_cost_per_1m=0.0, # TODO: from provider pricing page\n" + f" )," + ) + + +def print_comparison_report( + report: dict, + registry: dict[str, RegistryModel] | None = None, + show_all: bool = False, + emit_stubs: bool = True, +): + """Print a human-readable comparison report. + + By default (when ``registry`` is provided and ``show_all`` is False), only + "relevant" new models are highlighted: newer versions of model families we + already use. Pass ``show_all=True`` to list every discovered-but-unregistered + model (including older versions, non-text variants, and brand-new families). 
+ """ new_models = report["new_models"] pricing_changes = report["pricing_changes"] in_registry = report["in_registry"] registry_only = report["registry_only"] + if registry is not None and not show_all: + relevant, other = classify_new_models(new_models, registry) + highlighted = relevant + other_count = len(other) + else: + highlighted = new_models + other_count = 0 + # New models - if new_models: - print(f"\n NEW MODELS AVAILABLE ({len(new_models)}):") + if highlighted: + label = "NEW RELEVANT MODELS" if (registry is not None and not show_all) else "NEW MODELS AVAILABLE" + print(f"\n {label} ({len(highlighted)}):") print(" " + "─" * 66) - for m in new_models: + for m in highlighted: pricing = "" if m.input_cost_per_1m > 0: pricing = f" ${m.input_cost_per_1m:.2f}/${m.output_cost_per_1m:.2f} per MTok" @@ -648,6 +822,9 @@ def print_comparison_report(report: dict): print(f" + {m.provider}/{m.model_id}{pricing}{ctx}{notes}") print() + if other_count: + print(f" ({other_count} other discovered models hidden — older versions, new families, or non-text. Use --all.)\n") + # Pricing changes if pricing_changes: print(f"\n PRICING CHANGES ({len(pricing_changes)}):") @@ -661,7 +838,11 @@ def print_comparison_report(report: dict): # Summary print(" SUMMARY:") print(f" In registry: {len(in_registry)}") - print(f" New available: {len(new_models)}") + if registry is not None and not show_all: + print(f" New relevant: {len(highlighted)}") + print(f" Other (hidden): {other_count}") + else: + print(f" New available: {len(new_models)}") print(f" Pricing changes: {len(pricing_changes)}") print(f" Registry-only: {len(registry_only)}") @@ -670,7 +851,14 @@ def print_comparison_report(report: dict): for rm in sorted(registry_only, key=lambda x: x.key): print(f" ? {rm.key}") - if not new_models and not pricing_changes: + # Draft registry stubs for the relevant new models. 
+ if emit_stubs and highlighted: + print("\n SUGGESTED REGISTRY ENTRIES (fill in TODOs, then paste into registry.py):") + print(" " + "─" * 66) + for m in highlighted: + print(_registry_stub(m)) + + if not highlighted and not pricing_changes: print("\n Registry is up to date!") @@ -683,11 +871,26 @@ def _format_context(tokens: int) -> str: return str(tokens) -def output_json(registry: dict[str, RegistryModel], report: dict): - """Output report as JSON.""" +def output_json( + registry: dict[str, RegistryModel], + report: dict, + registry_obj: dict[str, RegistryModel] | None = None, + show_all: bool = False, +): + """Output report as JSON. + + When a registry is supplied and ``show_all`` is False, ``new_models`` holds + only the relevant (newer-version-of-a-known-family) models and + ``new_models_other`` holds the rest, so CI can alert on the relevant set. + """ + if registry_obj is not None and not show_all: + relevant, other = classify_new_models(report["new_models"], registry_obj) + else: + relevant, other = report["new_models"], [] result = { "registry_count": len(registry), - "new_models": [asdict(m) for m in report["new_models"]], + "new_models": [asdict(m) for m in relevant], + "new_models_other": [asdict(m) for m in other], "pricing_changes": report["pricing_changes"], "matched": len(report["in_registry"]), "registry_only": [asdict(m) for m in report["registry_only"]], @@ -699,6 +902,15 @@ def output_json(registry: dict[str, RegistryModel], report: dict): def main(): + # The report uses box-drawing characters (U+2500 etc.). On Windows the + # default console encoding (cp1252) can't encode them and the script + # crashes mid-report. Force UTF-8 output where the stream supports it. 
+ if hasattr(sys.stdout, "reconfigure"): + try: + sys.stdout.reconfigure(encoding="utf-8") + except (ValueError, OSError): + pass + parser = argparse.ArgumentParser( description="Discover available AI models and compare against Deepr's registry.", formatter_class=argparse.RawDescriptionHelpFormatter, @@ -739,6 +951,16 @@ def main(): action="store_true", help="Show current registry and exit", ) + parser.add_argument( + "--all", + action="store_true", + help="Show every discovered model (default: only newer versions of families already in the registry)", + ) + parser.add_argument( + "--no-stubs", + action="store_true", + help="Do not print suggested registry entry stubs for new relevant models", + ) parser.add_argument( "--verbose", "-v", @@ -791,9 +1013,9 @@ def main(): # Output if args.format == "json": - output_json(registry, report) + output_json(registry, report, registry_obj=registry, show_all=args.all) else: - print_comparison_report(report) + print_comparison_report(report, registry=registry, show_all=args.all, emit_stubs=not args.no_stubs) if __name__ == "__main__": diff --git a/tests/unit/test_providers/test_grok_provider.py b/tests/unit/test_providers/test_grok_provider.py index e1ccb9c..89df1b0 100644 --- a/tests/unit/test_providers/test_grok_provider.py +++ b/tests/unit/test_providers/test_grok_provider.py @@ -380,3 +380,31 @@ async def test_cancel_job(self, provider): # but the method should still be callable result = await provider.cancel_job(job_id) assert isinstance(result, bool) + + +class TestGrokHyphenatedRegistryForms: + """Regression: registry keys use hyphenated grok-4-20-* but the provider + mappings/pricing were keyed only by the dotted API form. A routed + "grok-4-20-reasoning" then went unmapped (wrong API id) and fell to the + grok-4-1-fast default price (~11x undercharge: $0.7 vs $8.0 per 1M/1M). 
+ """ + + @pytest.fixture + def provider(self): + return GrokProvider(api_key="test-xai-key") + + def test_hyphenated_forms_map_to_api_ids(self, provider): + assert provider.get_model_name("grok-4-20-reasoning") == "grok-4.20-0309-reasoning" + assert provider.get_model_name("grok-4-20-non-reasoning") == "grok-4.20-0309-non-reasoning" + assert provider.get_model_name("grok-4-20-multi-agent") == "grok-4.20-multi-agent-0309" + + def test_hyphenated_forms_priced_at_flagship_rate(self, provider): + # $2/$6 per MTok -> $8.0 for 1M in + 1M out; NOT the $0.70 fast default + for model in ("grok-4-20-reasoning", "grok-4-20-non-reasoning", "grok-4-20-multi-agent"): + cost = provider._calculate_cost(1_000_000, 1_000_000, model) + assert cost == 8.0, f"{model} mispriced: {cost}" + + def test_dotted_and_hyphenated_prices_match(self, provider): + assert provider._calculate_cost(1_000_000, 1_000_000, "grok-4-20-reasoning") == provider._calculate_cost( + 1_000_000, 1_000_000, "grok-4.20-0309-reasoning" + ) diff --git a/tests/unit/test_providers/test_registry.py b/tests/unit/test_providers/test_registry.py index 87f69bc..6935b45 100644 --- a/tests/unit/test_providers/test_registry.py +++ b/tests/unit/test_providers/test_registry.py @@ -6,6 +6,7 @@ MODEL_CAPABILITIES, ModelCapability, get_cheapest_model, + get_cost_estimate, get_fastest_model, get_largest_context_model, get_model_capability, @@ -175,5 +176,42 @@ def test_cost_latency_tradeoff(self): assert deep_research.latency_ms > fastest.latency_ms +class TestCostEstimateMatching: + """Regression tests: get_cost_estimate must resolve the most specific model. + + A prior first-substring-match fallback resolved snapshot/variant strings to + the shorter, wrong family member — over-charging (mini -> full price) and, + worse, under-charging (gpt-5.4-pro-YYYY-MM-DD -> cheaper gpt-5.4), which lets + budget pre-flight approve an expensive job against an underestimate.
+ """ + + def test_exact_keys(self): + assert get_cost_estimate("gpt-5.4") == 0.30 + assert get_cost_estimate("gpt-5.4-mini") == 0.05 + assert get_cost_estimate("gpt-5.4-nano") == 0.01 + + def test_snapshot_resolves_to_specific_model_not_prefix(self): + # mini/nano snapshots must NOT resolve to the more expensive gpt-5.4 + assert get_cost_estimate("gpt-5.4-mini-2026-03-17") == 0.05 + assert get_cost_estimate("gpt-5.4-nano-2026-03-17") == 0.01 + + def test_pro_snapshot_does_not_underestimate(self): + # The dangerous direction: must resolve to the pro price, not gpt-5.4 + assert get_cost_estimate("gpt-5.4-pro-2026-03-05") == 0.90 + assert get_cost_estimate("gpt-5.5-pro-2026-04-23") == 1.50 + + def test_dotted_and_hyphenated_grok_equivalent(self): + # Normalization: dotted API form and hyphenated registry form match + assert get_cost_estimate("grok-4.3") == get_cost_estimate("grok-4-3") + + def test_tiered_pricing_preserved(self): + base = get_cost_estimate("gemini-3.1-pro-preview") + tiered = get_cost_estimate("gemini-3.1-pro-preview", input_tokens=300_000) + assert tiered == round(base * 2.0, 4) + + def test_unknown_model_returns_default(self): + assert get_cost_estimate("totally-made-up-model-xyz") == 0.20 + + if __name__ == "__main__": pytest.main([__file__, "-v"])