blisspixel · blisspixel · May 23, 2026 · May 23, 2026
diff --git a/ROADMAP.md b/ROADMAP.md
@@ -1,6 +1,6 @@
 # Deepr Roadmap
 
-> Development priorities and planned features. Model/pricing notes updated through March 2026.
+> Development priorities and planned features. Model/pricing notes updated through May 2026.
 
 ## Quick Links
 
@@ -45,7 +45,7 @@ These features are well-tested and used regularly:
 - **Expert creation**: `expert make`, `expert chat`, `expert export/import`
 - **CLI output modes**: `--verbose`, `--json`, `--quiet`, `--explain`
 - **Context discovery**: `deepr search`, `--context <id>` for reusing prior research
-- **Provider support**: OpenAI (GPT-5.4, GPT-5.4-pro, GPT-5-mini, GPT-4.1, o3/o4-mini-deep-research), Gemini (3.1 Pro Preview, 3 Flash, 2.5 Flash, Deep Research Agent), xAI Grok (4.3 flagship, 4.20 Reasoning/Non-Reasoning/Multi-Agent), Anthropic (Claude Opus/Sonnet/Haiku 4.5), Azure AI Foundry (o3-deep-research + Bing, GPT-5/5-mini, GPT-4.1/4.1-mini, GPT-4o)
+- **Provider support**: OpenAI (GPT-5.4, GPT-5.4-pro, GPT-5-mini, GPT-4.1, o3/o4-mini-deep-research), Gemini (3.1 Pro Preview, 3.5 Flash, 3 Flash, 2.5 Flash, Deep Research Agent), xAI Grok (4.3 flagship, 4.20 Reasoning/Non-Reasoning/Multi-Agent), Anthropic (Claude Opus 4.7/4.6, Sonnet 4.6/4.5, Haiku 4.5), Azure AI Foundry (o3-deep-research + Bing, GPT-5/5-mini, GPT-4.1/4.1-mini, GPT-4o)
 - **Local storage**: SQLite persistence, markdown reports, expert profiles
 
 ### Experimental (Works but Evolving)
@@ -64,7 +64,7 @@ These features work but APIs or behavior may change:
 
 ### What Works (Full List)
 
-- Multi-provider support (OpenAI GPT-5.4/5-mini/4.1, Gemini 3.1 Pro/Flash-Lite/2.5, Grok 4.3/4.20, Anthropic Claude, Azure, Azure AI Foundry)
+- Multi-provider support (OpenAI GPT-5.4/5-mini/4.1, Gemini 3.5 Flash/3.1 Pro/3.1 Flash-Lite/2.5, Grok 4.3/4.20, Anthropic Claude, Azure, Azure AI Foundry)
 - Deep Research via OpenAI API (o3/o4-mini-deep-research) and Gemini Interactions API (Deep Research Agent)
 - Semantic commands (`research`, `learn`, `team`, `check`, `make`)
 - Expert system with autonomous learning, agentic chat (streaming, 27 slash commands, 4 chat modes, visible reasoning, context compaction, approval flows, expert council, task planning, memory commands), knowledge synthesis, curriculum preview (`expert plan`), domain-specific skills, AI-generated portraits
@@ -214,8 +214,10 @@ See [docs/INTEGRATIONS.md](docs/INTEGRATIONS.md) for the full integration contra
 
 Goal: continuously validate routing quality/cost claims with measurable feedback.
 
-- [ ] `deepr providers models` command (model discovery UX)
+- [x] `deepr providers models` command (model discovery UX): live provider model lists diffed against the registry, scoped by default to newer versions of families already in use, with paste-ready registry stubs (`scripts/discover_models.py`)
 - [ ] Stale-model CI checks + provider-family alerting
+  - [x] `deepr eval` preflight warns when newer relevant models are missing from the registry
+  - [ ] Scheduled CI job that alerts on provider model drift
 - [ ] Routing preview: `deepr research --preview --auto` shows exact model choice, estimated cost, and confidence before executing
 - [ ] Eval methodology v2:
   - [ ] Citation quality, grounding, synthesis depth, temporal accuracy

diff --git a/deepr/api/app.py b/deepr/api/app.py
@@ -667,8 +667,12 @@ def submit_job():
     # Update status
     run_async(queue.update_status(job_id=job_id, status=JobStatus.PROCESSING, provider_job_id=provider_job_id))
 
-    # Calculate cost estimate
-    avg_cost = 0.5 if "mini" in model else 5.0
+    # Calculate cost estimate from the registry (source of truth). A prior
+    # name heuristic ("mini" -> $0.5 else $5.0) wildly misestimated nano /
+    # flash-lite (over) and deep-research (under) models.
+    from deepr.providers.registry import get_cost_estimate
+
+    avg_cost = get_cost_estimate(model)
     estimated_cost = {
         "min_cost": avg_cost * 0.5,
         "max_cost": avg_cost * 2.0,

diff --git a/deepr/cli/commands/providers.py b/deepr/cli/commands/providers.py
@@ -460,6 +460,54 @@ def benchmark(quick: bool, target_provider: Optional[str], iterations: int, hist
         console.print(f"\n[green]Fastest:[/green] {best[0]} ({best[1]['avg_latency']:.0f}ms avg)")
 
 
+@providers.command()
+@click.option(
+    "--all",
+    "show_all",
+    is_flag=True,
+    help="Show every discovered model, not just newer versions of families already in the registry",
+)
+@click.option(
+    "--provider",
+    "target_provider",
+    type=click.Choice(["openai", "anthropic", "gemini", "xai", "azure-foundry"]),
+    help="Only check this provider",
+)
+@click.option("--json", "json_output", is_flag=True, help="Output as JSON (for CI / scripting)")
+@click.option("--no-stubs", is_flag=True, help="Don't print suggested registry-entry stubs")
+def models(show_all: bool, target_provider: Optional[str], json_output: bool, no_stubs: bool):
+    """Discover newer provider models missing from the registry.
+
+    Queries each configured provider's live model list and flags newer versions
+    of model families already in the registry (e.g. a new mini/nano tier, or a
+    preview that has gone GA). Use --all to see every discovered model.
+
+    Examples:
+        deepr providers models
+        deepr providers models --provider openai
+        deepr providers models --all
+        deepr providers models --json
+    """
+    import subprocess
+    import sys
+    from pathlib import Path
+
+    script = Path(__file__).resolve().parents[3] / "scripts" / "discover_models.py"
+    cmd = [sys.executable, str(script)]
+    if show_all:
+        cmd.append("--all")
+    if no_stubs:
+        cmd.append("--no-stubs")
+    if json_output:
+        cmd += ["--format", "json"]
+    if target_provider:
+        cmd += ["--provider", target_provider]
+
+    result = subprocess.run(cmd)
+    if result.returncode != 0:
+        raise click.ClickException(f"Model discovery exited with status {result.returncode}")
+
+
 @providers.command()
 def list():
     """List all available providers and models."""

diff --git a/deepr/providers/gemini_provider.py b/deepr/providers/gemini_provider.py
@@ -147,8 +147,10 @@ def __init__(
 
         # Model mappings for convenience
         self.model_mappings = model_mappings or {
+            "gemini-3.5-flash": "gemini-3.5-flash",
             "gemini-3.1-pro-preview": "gemini-3.1-pro-preview",
             "gemini-3.1-pro": "gemini-3.1-pro-preview",
+            "gemini-3.1-flash-lite": "gemini-3.1-flash-lite",
             "gemini-3.1-flash-lite-preview": "gemini-3.1-flash-lite-preview",
             "gemini-3-pro-preview": "gemini-3-pro-preview",
             "gemini-2.5-pro": "gemini-2.5-pro",
@@ -166,7 +168,9 @@ def __init__(
         from .registry import get_token_pricing
 
         self.pricing = {
+            "gemini-3.5-flash": get_token_pricing("gemini-3.5-flash"),
             "gemini-3.1-pro-preview": get_token_pricing("gemini-3.1-pro-preview"),
+            "gemini-3.1-flash-lite": get_token_pricing("gemini-3.1-flash-lite"),
             "gemini-3.1-flash-lite-preview": get_token_pricing("gemini-3.1-flash-lite-preview"),
             "gemini-3-pro-preview": get_token_pricing("gemini-3-pro-preview"),
             "gemini-2.5-pro": get_token_pricing("gemini-2.5-pro"),
@@ -210,7 +214,10 @@ def _calculate_cost(self, input_tokens: int, output_tokens: int, model: str) ->
             return self.deep_research_cost_estimate
 
         base_model = model
-        for key in self.pricing:
+        # Match the longest pricing key first so e.g. "gemini-2.5-flash-lite"
+        # resolves to its own entry instead of the shorter "gemini-2.5-flash"
+        # prefix — which would bill Flash-Lite at the Flash rate (~5x too high).
+        for key in sorted(self.pricing, key=len, reverse=True):
             if key in model:
                 base_model = key
                 break

diff --git a/deepr/providers/grok_provider.py b/deepr/providers/grok_provider.py
@@ -72,6 +72,12 @@ def __init__(
             "grok-4.20-non-reasoning": "grok-4.20-0309-non-reasoning",
             "grok-4.20-multi-agent": "grok-4.20-multi-agent-0309",
             "grok-4.20": "grok-4.20-0309-non-reasoning",
+            # Hyphenated registry forms (registry keys use grok-4-20-*, not the
+            # dotted API form). Without these, a routed "grok-4-20-reasoning"
+            # falls through unmapped → wrong API id + ~11x cost undercharge.
+            "grok-4-20-reasoning": "grok-4.20-0309-reasoning",
+            "grok-4-20-non-reasoning": "grok-4.20-0309-non-reasoning",
+            "grok-4-20-multi-agent": "grok-4.20-multi-agent-0309",
             # Grok 4.1 Fast budget tier
             "grok-4-1-fast-reasoning": "grok-4-1-fast-reasoning",
             "grok-4-1-fast-non-reasoning": "grok-4-1-fast-non-reasoning",
@@ -98,10 +104,15 @@ def __init__(
 
         _grok_4_1_fast = get_token_pricing("grok-4-1-fast-non-reasoning")
         self.pricing = {
-            # Grok 4.20 flagship ($2/$6 per MTok)
+            # Grok 4.20 flagship ($2/$6 per MTok). Keyed under the dotted API
+            # ids and the hyphenated registry forms so cost accounting is correct
+            # regardless of which name reaches _calculate_cost.
             "grok-4.20-0309-reasoning": {"input": 2.00, "output": 6.00},
             "grok-4.20-0309-non-reasoning": {"input": 2.00, "output": 6.00},
             "grok-4.20-multi-agent-0309": {"input": 2.00, "output": 6.00},
+            "grok-4-20-reasoning": {"input": 2.00, "output": 6.00},
+            "grok-4-20-non-reasoning": {"input": 2.00, "output": 6.00},
+            "grok-4-20-multi-agent": {"input": 2.00, "output": 6.00},
             # Grok 4.3 ($1.25/$2.50 per MTok). Listed under both the canonical
             # name and the hyphenated alias so any caller / older code path
             # that submits "grok-4-3" still gets accurate cost accounting.

diff --git a/deepr/providers/registry.py b/deepr/providers/registry.py
@@ -67,6 +67,47 @@ class ModelCapability:
         input_cost_per_1m=30.00,
         output_cost_per_1m=180.00,
     ),
+    "openai/gpt-5.4-mini": ModelCapability(
+        provider="openai",
+        model="gpt-5.4-mini",
+        cost_per_query=0.05,
+        latency_ms=1500,
+        context_window=400_000,
+        specializations=["reasoning", "speed", "balanced", "agentic"],
+        strengths=[
+            "Newer-generation budget reasoning (GPT-5.4 family)",
+            "Good reasoning at low cost",
+            "Fast responses",
+            "400K context window",
+            "Configurable reasoning effort",
+        ],
+        weaknesses=[
+            "Pricier than gpt-5-mini ($0.75/$4.50 vs $0.25/$2.00 per MTok)",
+            "Less capable than full gpt-5.4",
+        ],
+        input_cost_per_1m=0.75,
+        output_cost_per_1m=4.50,
+    ),
+    "openai/gpt-5.4-nano": ModelCapability(
+        provider="openai",
+        model="gpt-5.4-nano",
+        cost_per_query=0.01,
+        latency_ms=800,
+        context_window=400_000,
+        specializations=["speed", "cost", "general", "summarization"],
+        strengths=[
+            "Cheapest GPT-5.4 variant ($0.20/$1.25 per MTok)",
+            "Very fast responses",
+            "400K context window",
+            "Good for summarization and classification",
+        ],
+        weaknesses=[
+            "Lowest reasoning capability in GPT-5.4 family",
+            "Pricier than gpt-5-nano ($0.20/$1.25 vs $0.05/$0.40 per MTok)",
+        ],
+        input_cost_per_1m=0.20,
+        output_cost_per_1m=1.25,
+    ),
     "openai/gpt-5.5": ModelCapability(
         provider="openai",
         model="gpt-5.5",
@@ -525,6 +566,29 @@ class ModelCapability:
         successor="xai/grok-imagine-image",
     ),
     # Google Models (Gemini)
+    # Gemini 3.5 Flash — newest Flash generation (GA May 19, 2026, Google I/O 2026)
+    "gemini/gemini-3.5-flash": ModelCapability(
+        provider="gemini",
+        model="gemini-3.5-flash",
+        cost_per_query=0.03,
+        latency_ms=1500,
+        context_window=1_000_000,
+        specializations=["reasoning", "coding", "agentic", "multimodal", "speed", "thinking"],
+        strengths=[
+            "First model in the Gemini 3.5 family (GA May 19, 2026)",
+            "Surpasses Gemini 3.1 Pro on coding, agentic, and multimodal benchmarks",
+            "Frontier intelligence at Flash speed (~4x faster output than frontier peers)",
+            "1M token context window, 65K output",
+            "Multimodal input (text, image, audio, video, PDF)",
+            "Dynamic thinking",
+        ],
+        weaknesses=[
+            "3x pricier than Gemini 3 Flash preview ($1.50/$9.00 vs $0.50/$3.00 per MTok)",
+            "Thinking tokens add to output cost",
+        ],
+        input_cost_per_1m=1.50,
+        output_cost_per_1m=9.00,  # Includes thinking tokens; non-global regions $1.65/$9.90
+    ),
     "gemini/gemini-3-flash-preview": ModelCapability(
         provider="gemini",
         model="gemini-3-flash-preview",
@@ -538,10 +602,34 @@ class ModelCapability:
             "1M token context window",
             "Dynamic thinking",
         ],
-        weaknesses=["Preview model (may change)", "Thinking tokens add to output cost"],
+        weaknesses=[
+            "Preview model (may change)",
+            "Thinking tokens add to output cost",
+            "Superseded for quality by gemini-3.5-flash (which costs ~3x more)",
+        ],
         input_cost_per_1m=0.50,
         output_cost_per_1m=3.00,  # Includes thinking tokens
     ),
+    # Gemini 3.1 Flash-Lite — GA (May 7, 2026); most cost-effective Gemini
+    "gemini/gemini-3.1-flash-lite": ModelCapability(
+        provider="gemini",
+        model="gemini-3.1-flash-lite",
+        cost_per_query=0.007,
+        latency_ms=1300,
+        context_window=1_000_000,
+        specializations=["speed", "cost", "general", "high_throughput", "thinking"],
+        strengths=[
+            "Most cost-effective Gemini model (GA May 7, 2026)",
+            "1M token context window",
+            "Low-cost high-throughput inference",
+            "Dynamic thinking",
+        ],
+        weaknesses=[
+            "Less capable than Pro/Flash models on deep reasoning",
+        ],
+        input_cost_per_1m=0.25,
+        output_cost_per_1m=1.50,
+    ),
     "gemini/gemini-3.1-flash-lite-preview": ModelCapability(
         provider="gemini",
         model="gemini-3.1-flash-lite-preview",
@@ -557,6 +645,7 @@ class ModelCapability:
         weaknesses=[
             "Preview model (lifecycle may change)",
             "Less capable than Pro models on deep reasoning",
+            "Superseded by GA gemini-3.1-flash-lite ($0.25/$1.50 per MTok)",
         ],
         input_cost_per_1m=0.20,
         output_cost_per_1m=1.20,
@@ -676,6 +765,30 @@ class ModelCapability:
     # Note: Anthropic does NOT have a turnkey deep research API like OpenAI/Gemini.
     # Research capability is achieved via Extended Thinking + tool use + our orchestration.
     # For research, we recommend Opus 4.6 - best reasoning with Adaptive Thinking.
+    # Claude Opus 4.7 — most capable Claude (GA Apr 16, 2026); leads SWE-bench Pro
+    "anthropic/claude-opus-4-7": ModelCapability(
+        provider="anthropic",
+        model="claude-opus-4-7",
+        cost_per_query=0.85,  # Same per-token rate as 4.6, but new tokenizer (~35% more tokens)
+        latency_ms=12000,
+        context_window=1_000_000,  # Full 1M at standard pricing
+        specializations=["research", "reasoning", "coding", "analysis", "complex_tasks", "agents"],
+        strengths=[
+            "Most capable Claude model (GA Apr 16, 2026)",
+            "Leads SWE-bench Pro (64.3%)",
+            "Adaptive Thinking (auto-adjusts reasoning effort)",
+            "Full 1M token context window at standard pricing",
+            "128K max output tokens",
+            "Fast mode available (6x price for faster output)",
+        ],
+        weaknesses=[
+            "No native deep research API (requires orchestration)",
+            "New tokenizer uses up to 35% more tokens for the same text (higher effective cost)",
+            "Slower than Sonnet (~12s vs ~3s)",
+        ],
+        input_cost_per_1m=5.00,
+        output_cost_per_1m=25.00,
+    ),
     "anthropic/claude-opus-4-6": ModelCapability(
         provider="anthropic",
         model="claude-opus-4-6",
@@ -695,10 +808,33 @@ class ModelCapability:
             "No native deep research API (requires orchestration)",
             "Slower than Sonnet (~12s vs ~3s)",
             "Higher cost than Sonnet (~$0.80 vs ~$0.48/query)",
+            "Superseded by claude-opus-4-7 (same price)",
         ],
         input_cost_per_1m=5.00,
         output_cost_per_1m=25.00,
     ),
+    # Claude Sonnet 4.6 — best value for everyday coding (GA Apr 2026)
+    "anthropic/claude-sonnet-4-6": ModelCapability(
+        provider="anthropic",
+        model="claude-sonnet-4-6",
+        cost_per_query=0.48,  # Estimated with 16K thinking budget
+        latency_ms=3000,
+        context_window=1_000_000,  # Full 1M at standard pricing
+        specializations=["reasoning", "coding", "analysis", "balanced", "agents"],
+        strengths=[
+            "Best speed/intelligence balance; best value for everyday coding",
+            "Fast responses (~3s)",
+            "Extended Thinking support",
+            "Full 1M token context window at standard pricing",
+            "64K max output tokens",
+        ],
+        weaknesses=[
+            "Less capable than Opus 4.7 for complex research",
+            "No native deep research API",
+        ],
+        input_cost_per_1m=3.00,
+        output_cost_per_1m=15.00,
+    ),
     "anthropic/claude-sonnet-4-5": ModelCapability(
         provider="anthropic",
         model="claude-sonnet-4-5",
@@ -717,6 +853,7 @@ class ModelCapability:
             "Less capable than Opus 4.6 for complex research",
             "No Adaptive Thinking",
             "No native deep research API",
+            "Superseded by claude-sonnet-4-6 (same price)",
         ],
         input_cost_per_1m=3.00,
         output_cost_per_1m=15.00,
@@ -974,20 +1111,29 @@ def get_cost_estimate(model: str, input_tokens: int | None = None) -> float:
         Estimated cost per query in USD. Returns 0.20 if model not found.
     """
     resolved = _MODEL_ALIASES.get(model, model)
+    needle = _normalize_model_name(resolved)
     base = 0.20
+
+    # Exact match (normalized) first.
     for cap in MODEL_CAPABILITIES.values():
-        if cap.model == resolved:
+        if _normalize_model_name(cap.model) == needle:
             base = cap.cost_per_query
             break
     else:
-        for cap in MODEL_CAPABILITIES.values():
-            if cap.model in resolved:
+        # Partial match — longest cap.model first so e.g. a "gpt-5.4-mini"
+        # snapshot matches its own entry before the shorter "gpt-5.4" prefix.
+        # Without longest-first this both over-charges (mini -> full price) and,
+        # worse, under-charges ("gpt-5.4-pro-<date>" -> cheaper "gpt-5.4"),
+        # letting budget pre-flight approve an expensive job. Mirrors
+        # get_token_pricing().
+        for cap in sorted(MODEL_CAPABILITIES.values(), key=lambda c: len(c.model or ""), reverse=True):
+            if _normalize_model_name(cap.model) in needle:
                 base = cap.cost_per_query
                 break
 
     if input_tokens is not None:
         for tiered_model, (threshold, multiplier) in _TIERED_PRICING.items():
-            if tiered_model in resolved and input_tokens > threshold:
+            if _normalize_model_name(tiered_model) in needle and input_tokens > threshold:
                 return round(base * multiplier, 4)
 
     return base