diff --git a/ROADMAP.md b/ROADMAP.md
index cdbc725..1f9c026 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -1,6 +1,6 @@
 # Deepr Roadmap
 
-> Development priorities and planned features. Model/pricing notes updated through March 2026.
+> Development priorities and planned features. Model/pricing notes updated through May 2026.
 
 ## Quick Links
 
@@ -45,7 +45,7 @@ These features are well-tested and used regularly:
 - **Expert creation**: `expert make`, `expert chat`, `expert export/import`
 - **CLI output modes**: `--verbose`, `--json`, `--quiet`, `--explain`
 - **Context discovery**: `deepr search`, `--context <id>` for reusing prior research
-- **Provider support**: OpenAI (GPT-5.4, GPT-5.4-pro, GPT-5-mini, GPT-4.1, o3/o4-mini-deep-research), Gemini (3.1 Pro Preview, 3 Flash, 2.5 Flash, Deep Research Agent), xAI Grok (4.3 flagship, 4.20 Reasoning/Non-Reasoning/Multi-Agent), Anthropic (Claude Opus/Sonnet/Haiku 4.5), Azure AI Foundry (o3-deep-research + Bing, GPT-5/5-mini, GPT-4.1/4.1-mini, GPT-4o)
+- **Provider support**: OpenAI (GPT-5.4, GPT-5.4-pro, GPT-5-mini, GPT-4.1, o3/o4-mini-deep-research), Gemini (3.1 Pro Preview, 3.5 Flash, 3 Flash, 2.5 Flash, Deep Research Agent), xAI Grok (4.3 flagship, 4.20 Reasoning/Non-Reasoning/Multi-Agent), Anthropic (Claude Opus 4.7/4.6, Sonnet 4.6/4.5, Haiku 4.5), Azure AI Foundry (o3-deep-research + Bing, GPT-5/5-mini, GPT-4.1/4.1-mini, GPT-4o)
 - **Local storage**: SQLite persistence, markdown reports, expert profiles
 
 ### Experimental (Works but Evolving)
@@ -64,7 +64,7 @@ These features work but APIs or behavior may change:
 
 ### What Works (Full List)
 
-- Multi-provider support (OpenAI GPT-5.4/5-mini/4.1, Gemini 3.1 Pro/Flash-Lite/2.5, Grok 4.3/4.20, Anthropic Claude, Azure, Azure AI Foundry)
+- Multi-provider support (OpenAI GPT-5.4/5-mini/4.1, Gemini 3.5 Flash/3.1 Pro/Flash-Lite/2.5, Grok 4.3/4.20, Anthropic Claude, Azure, Azure AI Foundry)
 - Deep Research via OpenAI API (o3/o4-mini-deep-research) and Gemini Interactions API (Deep Research Agent)
 - Semantic commands (`research`, `learn`, `team`, `check`, `make`)
 - Expert system with autonomous learning, agentic chat (streaming, 27 slash commands, 4 chat modes, visible reasoning, context compaction, approval flows, expert council, task planning, memory commands), knowledge synthesis, curriculum preview (`expert plan`), domain-specific skills, AI-generated portraits
@@ -214,8 +214,10 @@ See [docs/INTEGRATIONS.md](docs/INTEGRATIONS.md) for the full integration contra
 
 Goal: continuously validate routing quality/cost claims with measurable feedback.
 
-- [ ] `deepr providers models` command (model discovery UX)
+- [x] `deepr providers models` command (model discovery UX): live provider model lists diffed against the registry, scoped by default to newer versions of families already in use, with paste-ready registry stubs (`scripts/discover_models.py`)
 - [ ] Stale-model CI checks + provider-family alerting
+  - [x] `deepr eval` preflight warns when newer relevant models are missing from the registry
+  - [ ] Scheduled CI job that alerts on provider model drift
 - [ ] Routing preview: `deepr research --preview --auto` shows exact model choice, estimated cost, and confidence before executing
 - [ ] Eval methodology v2:
   - [ ] Citation quality, grounding, synthesis depth, temporal accuracy
diff --git a/deepr/api/app.py b/deepr/api/app.py
index 8acf048..6e19e29 100644
--- a/deepr/api/app.py
+++ b/deepr/api/app.py
@@ -667,8 +667,12 @@ def submit_job():
     # Update status
     run_async(queue.update_status(job_id=job_id, status=JobStatus.PROCESSING, provider_job_id=provider_job_id))
 
-    # Calculate cost estimate
-    avg_cost = 0.5 if "mini" in model else 5.0
+    # Calculate cost estimate from the registry (source of truth). A prior
+    # name heuristic ("mini" -> $0.5 else $5.0) wildly misestimated nano /
+    # flash-lite (over) and deep-research (under) models.
+    from deepr.providers.registry import get_cost_estimate
+
+    avg_cost = get_cost_estimate(model)
     estimated_cost = {
         "min_cost": avg_cost * 0.5,
         "max_cost": avg_cost * 2.0,
diff --git a/deepr/cli/commands/providers.py b/deepr/cli/commands/providers.py
index 5a1d6df..5757a08 100644
--- a/deepr/cli/commands/providers.py
+++ b/deepr/cli/commands/providers.py
@@ -460,6 +460,54 @@ def benchmark(quick: bool, target_provider: Optional[str], iterations: int, hist
         console.print(f"\n[green]Fastest:[/green] {best[0]} ({best[1]['avg_latency']:.0f}ms avg)")
 
 
+@providers.command()
+@click.option(
+    "--all",
+    "show_all",
+    is_flag=True,
+    help="Show every discovered model, not just newer versions of families already in the registry",
+)
+@click.option(
+    "--provider",
+    "target_provider",
+    type=click.Choice(["openai", "anthropic", "gemini", "xai", "azure-foundry"]),
+    help="Only check this provider",
+)
+@click.option("--json", "json_output", is_flag=True, help="Output as JSON (for CI / scripting)")
+@click.option("--no-stubs", is_flag=True, help="Don't print suggested registry-entry stubs")
+def models(show_all: bool, target_provider: Optional[str], json_output: bool, no_stubs: bool):
+    """Discover newer provider models missing from the registry.
+
+    Queries each configured provider's live model list and flags newer versions
+    of model families already in the registry (e.g. a new mini/nano tier, or a
+    preview that has gone GA). Use --all to see every discovered model.
+
+    Examples:
+        deepr providers models
+        deepr providers models --provider openai
+        deepr providers models --all
+        deepr providers models --json
+    """
+    import subprocess
+    import sys
+    from pathlib import Path
+
+    script = Path(__file__).resolve().parents[3] / "scripts" / "discover_models.py"
+    cmd = [sys.executable, str(script)]
+    if show_all:
+        cmd.append("--all")
+    if no_stubs:
+        cmd.append("--no-stubs")
+    if json_output:
+        cmd += ["--format", "json"]
+    if target_provider:
+        cmd += ["--provider", target_provider]
+
+    result = subprocess.run(cmd)
+    if result.returncode != 0:
+        raise click.ClickException(f"Model discovery exited with status {result.returncode}")
+
+
 @providers.command()
 def list():
     """List all available providers and models."""
diff --git a/deepr/providers/gemini_provider.py b/deepr/providers/gemini_provider.py
index a92aab8..aaf6dab 100644
--- a/deepr/providers/gemini_provider.py
+++ b/deepr/providers/gemini_provider.py
@@ -147,8 +147,10 @@ def __init__(
 
         # Model mappings for convenience
         self.model_mappings = model_mappings or {
+            "gemini-3.5-flash": "gemini-3.5-flash",
             "gemini-3.1-pro-preview": "gemini-3.1-pro-preview",
             "gemini-3.1-pro": "gemini-3.1-pro-preview",
+            "gemini-3.1-flash-lite": "gemini-3.1-flash-lite",
             "gemini-3.1-flash-lite-preview": "gemini-3.1-flash-lite-preview",
             "gemini-3-pro-preview": "gemini-3-pro-preview",
             "gemini-2.5-pro": "gemini-2.5-pro",
@@ -166,7 +168,9 @@ def __init__(
         from .registry import get_token_pricing
 
         self.pricing = {
+            "gemini-3.5-flash": get_token_pricing("gemini-3.5-flash"),
             "gemini-3.1-pro-preview": get_token_pricing("gemini-3.1-pro-preview"),
+            "gemini-3.1-flash-lite": get_token_pricing("gemini-3.1-flash-lite"),
             "gemini-3.1-flash-lite-preview": get_token_pricing("gemini-3.1-flash-lite-preview"),
             "gemini-3-pro-preview": get_token_pricing("gemini-3-pro-preview"),
             "gemini-2.5-pro": get_token_pricing("gemini-2.5-pro"),
@@ -210,7 +214,10 @@ def _calculate_cost(self, input_tokens: int, output_tokens: int, model: str) ->
             return self.deep_research_cost_estimate
 
         base_model = model
-        for key in self.pricing:
+        # Match the longest pricing key first so e.g. "gemini-2.5-flash-lite"
+        # resolves to its own entry instead of the shorter "gemini-2.5-flash"
+        # prefix — which would bill Flash-Lite at the Flash rate (~5x its own).
+        for key in sorted(self.pricing, key=len, reverse=True):
             if key in model:
                 base_model = key
                 break
diff --git a/deepr/providers/grok_provider.py b/deepr/providers/grok_provider.py
index bf3fb3b..da71e55 100644
--- a/deepr/providers/grok_provider.py
+++ b/deepr/providers/grok_provider.py
@@ -72,6 +72,12 @@ def __init__(
             "grok-4.20-non-reasoning": "grok-4.20-0309-non-reasoning",
             "grok-4.20-multi-agent": "grok-4.20-multi-agent-0309",
             "grok-4.20": "grok-4.20-0309-non-reasoning",
+            # Hyphenated registry forms (registry keys use grok-4-20-*, not the
+            # dotted API form). Without these, a routed "grok-4-20-reasoning"
+            # falls through unmapped → wrong API id + ~11x cost undercharge.
+            "grok-4-20-reasoning": "grok-4.20-0309-reasoning",
+            "grok-4-20-non-reasoning": "grok-4.20-0309-non-reasoning",
+            "grok-4-20-multi-agent": "grok-4.20-multi-agent-0309",
             # Grok 4.1 Fast budget tier
             "grok-4-1-fast-reasoning": "grok-4-1-fast-reasoning",
             "grok-4-1-fast-non-reasoning": "grok-4-1-fast-non-reasoning",
@@ -98,10 +104,15 @@ def __init__(
 
         _grok_4_1_fast = get_token_pricing("grok-4-1-fast-non-reasoning")
         self.pricing = {
-            # Grok 4.20 flagship ($2/$6 per MTok)
+            # Grok 4.20 flagship ($2/$6 per MTok). Keyed under the dotted API
+            # ids and the hyphenated registry forms so cost accounting is correct
+            # regardless of which name reaches _calculate_cost.
             "grok-4.20-0309-reasoning": {"input": 2.00, "output": 6.00},
             "grok-4.20-0309-non-reasoning": {"input": 2.00, "output": 6.00},
             "grok-4.20-multi-agent-0309": {"input": 2.00, "output": 6.00},
+            "grok-4-20-reasoning": {"input": 2.00, "output": 6.00},
+            "grok-4-20-non-reasoning": {"input": 2.00, "output": 6.00},
+            "grok-4-20-multi-agent": {"input": 2.00, "output": 6.00},
             # Grok 4.3 ($1.25/$2.50 per MTok). Listed under both the canonical
             # name and the hyphenated alias so any caller / older code path
             # that submits "grok-4-3" still gets accurate cost accounting.
diff --git a/deepr/providers/registry.py b/deepr/providers/registry.py
index 79d3242..3d34a44 100644
--- a/deepr/providers/registry.py
+++ b/deepr/providers/registry.py
@@ -67,6 +67,47 @@ class ModelCapability:
         input_cost_per_1m=30.00,
         output_cost_per_1m=180.00,
     ),
+    "openai/gpt-5.4-mini": ModelCapability(
+        provider="openai",
+        model="gpt-5.4-mini",
+        cost_per_query=0.05,
+        latency_ms=1500,
+        context_window=400_000,
+        specializations=["reasoning", "speed", "balanced", "agentic"],
+        strengths=[
+            "Newer-generation budget reasoning (GPT-5.4 family)",
+            "Good reasoning at low cost",
+            "Fast responses",
+            "400K context window",
+            "Configurable reasoning effort",
+        ],
+        weaknesses=[
+            "Pricier than gpt-5-mini ($0.75/$4.50 vs $0.25/$2.00 per MTok)",
+            "Less capable than full gpt-5.4",
+        ],
+        input_cost_per_1m=0.75,
+        output_cost_per_1m=4.50,
+    ),
+    "openai/gpt-5.4-nano": ModelCapability(
+        provider="openai",
+        model="gpt-5.4-nano",
+        cost_per_query=0.01,
+        latency_ms=800,
+        context_window=400_000,
+        specializations=["speed", "cost", "general", "summarization"],
+        strengths=[
+            "Cheapest GPT-5.4 variant ($0.20/$1.25 per MTok)",
+            "Very fast responses",
+            "400K context window",
+            "Good for summarization and classification",
+        ],
+        weaknesses=[
+            "Lowest reasoning capability in GPT-5.4 family",
+            "Pricier than gpt-5-nano ($0.20/$1.25 vs $0.05/$0.40 per MTok)",
+        ],
+        input_cost_per_1m=0.20,
+        output_cost_per_1m=1.25,
+    ),
     "openai/gpt-5.5": ModelCapability(
         provider="openai",
         model="gpt-5.5",
@@ -525,6 +566,29 @@ class ModelCapability:
         successor="xai/grok-imagine-image",
     ),
     # Google Models (Gemini)
+    # Gemini 3.5 Flash — newest Flash generation (GA May 19, 2026, Google I/O 2026)
+    "gemini/gemini-3.5-flash": ModelCapability(
+        provider="gemini",
+        model="gemini-3.5-flash",
+        cost_per_query=0.03,
+        latency_ms=1500,
+        context_window=1_000_000,
+        specializations=["reasoning", "coding", "agentic", "multimodal", "speed", "thinking"],
+        strengths=[
+            "First model in the Gemini 3.5 family (GA May 19, 2026)",
+            "Surpasses Gemini 3.1 Pro on coding, agentic, and multimodal benchmarks",
+            "Frontier intelligence at Flash speed (~4x faster output than frontier peers)",
+            "1M token context window, 65K output",
+            "Multimodal input (text, image, audio, video, PDF)",
+            "Dynamic thinking",
+        ],
+        weaknesses=[
+            "3x pricier than Gemini 3 Flash preview ($1.50/$9.00 vs $0.50/$3.00 per MTok)",
+            "Thinking tokens add to output cost",
+        ],
+        input_cost_per_1m=1.50,
+        output_cost_per_1m=9.00,  # Includes thinking tokens; non-global regions $1.65/$9.90
+    ),
     "gemini/gemini-3-flash-preview": ModelCapability(
         provider="gemini",
         model="gemini-3-flash-preview",
@@ -538,10 +602,34 @@ class ModelCapability:
             "1M token context window",
             "Dynamic thinking",
         ],
-        weaknesses=["Preview model (may change)", "Thinking tokens add to output cost"],
+        weaknesses=[
+            "Preview model (may change)",
+            "Thinking tokens add to output cost",
+            "Superseded for quality by gemini-3.5-flash (which costs ~3x more)",
+        ],
         input_cost_per_1m=0.50,
         output_cost_per_1m=3.00,  # Includes thinking tokens
     ),
+    # Gemini 3.1 Flash-Lite — GA (May 7, 2026); most cost-effective Gemini
+    "gemini/gemini-3.1-flash-lite": ModelCapability(
+        provider="gemini",
+        model="gemini-3.1-flash-lite",
+        cost_per_query=0.007,
+        latency_ms=1300,
+        context_window=1_000_000,
+        specializations=["speed", "cost", "general", "high_throughput", "thinking"],
+        strengths=[
+            "Most cost-effective Gemini model (GA May 7, 2026)",
+            "1M token context window",
+            "Low-cost high-throughput inference",
+            "Dynamic thinking",
+        ],
+        weaknesses=[
+            "Less capable than Pro/Flash models on deep reasoning",
+        ],
+        input_cost_per_1m=0.25,
+        output_cost_per_1m=1.50,
+    ),
     "gemini/gemini-3.1-flash-lite-preview": ModelCapability(
         provider="gemini",
         model="gemini-3.1-flash-lite-preview",
@@ -557,6 +645,7 @@ class ModelCapability:
         weaknesses=[
             "Preview model (lifecycle may change)",
             "Less capable than Pro models on deep reasoning",
+            "Superseded by GA gemini-3.1-flash-lite ($0.25/$1.50 per MTok)",
         ],
         input_cost_per_1m=0.20,
         output_cost_per_1m=1.20,
@@ -676,6 +765,30 @@ class ModelCapability:
     # Note: Anthropic does NOT have a turnkey deep research API like OpenAI/Gemini.
     # Research capability is achieved via Extended Thinking + tool use + our orchestration.
     # For research, we recommend Opus 4.6 - best reasoning with Adaptive Thinking.
+    # Claude Opus 4.7 — most capable Claude (GA Apr 16, 2026); leads SWE-bench Pro
+    "anthropic/claude-opus-4-7": ModelCapability(
+        provider="anthropic",
+        model="claude-opus-4-7",
+        cost_per_query=0.85,  # Same per-token rate as 4.6, but new tokenizer (~35% more tokens)
+        latency_ms=12000,
+        context_window=1_000_000,  # Full 1M at standard pricing
+        specializations=["research", "reasoning", "coding", "analysis", "complex_tasks", "agents"],
+        strengths=[
+            "Most capable Claude model (GA Apr 16, 2026)",
+            "Leads SWE-bench Pro (64.3%)",
+            "Adaptive Thinking (auto-adjusts reasoning effort)",
+            "Full 1M token context window at standard pricing",
+            "128K max output tokens",
+            "Fast mode available (6x price for faster output)",
+        ],
+        weaknesses=[
+            "No native deep research API (requires orchestration)",
+            "New tokenizer uses up to 35% more tokens for the same text (higher effective cost)",
+            "Slower than Sonnet (~12s vs ~3s)",
+        ],
+        input_cost_per_1m=5.00,
+        output_cost_per_1m=25.00,
+    ),
     "anthropic/claude-opus-4-6": ModelCapability(
         provider="anthropic",
         model="claude-opus-4-6",
@@ -695,10 +808,33 @@ class ModelCapability:
             "No native deep research API (requires orchestration)",
             "Slower than Sonnet (~12s vs ~3s)",
             "Higher cost than Sonnet (~$0.80 vs ~$0.48/query)",
+            "Superseded by claude-opus-4-7 (same price)",
         ],
         input_cost_per_1m=5.00,
         output_cost_per_1m=25.00,
     ),
+    # Claude Sonnet 4.6 — best value for everyday coding (GA Apr 2026)
+    "anthropic/claude-sonnet-4-6": ModelCapability(
+        provider="anthropic",
+        model="claude-sonnet-4-6",
+        cost_per_query=0.48,  # Estimated with 16K thinking budget
+        latency_ms=3000,
+        context_window=1_000_000,  # Full 1M at standard pricing
+        specializations=["reasoning", "coding", "analysis", "balanced", "agents"],
+        strengths=[
+            "Best speed/intelligence balance; best value for everyday coding",
+            "Fast responses (~3s)",
+            "Extended Thinking support",
+            "Full 1M token context window at standard pricing",
+            "64K max output tokens",
+        ],
+        weaknesses=[
+            "Less capable than Opus 4.7 for complex research",
+            "No native deep research API",
+        ],
+        input_cost_per_1m=3.00,
+        output_cost_per_1m=15.00,
+    ),
     "anthropic/claude-sonnet-4-5": ModelCapability(
         provider="anthropic",
         model="claude-sonnet-4-5",
@@ -717,6 +853,7 @@ class ModelCapability:
             "Less capable than Opus 4.6 for complex research",
             "No Adaptive Thinking",
             "No native deep research API",
+            "Superseded by claude-sonnet-4-6 (same price)",
         ],
         input_cost_per_1m=3.00,
         output_cost_per_1m=15.00,
@@ -974,20 +1111,29 @@ def get_cost_estimate(model: str, input_tokens: int | None = None) -> float:
         Estimated cost per query in USD. Returns 0.20 if model not found.
     """
     resolved = _MODEL_ALIASES.get(model, model)
+    needle = _normalize_model_name(resolved)
     base = 0.20
+
+    # Exact match (normalized) first.
     for cap in MODEL_CAPABILITIES.values():
-        if cap.model == resolved:
+        if _normalize_model_name(cap.model) == needle:
             base = cap.cost_per_query
             break
     else:
-        for cap in MODEL_CAPABILITIES.values():
-            if cap.model in resolved:
+        # Partial match — longest cap.model first so e.g. a "gpt-5.4-mini"
+        # snapshot matches its own entry before the shorter "gpt-5.4" prefix.
+        # Without longest-first this both over-charges (mini -> full price) and,
+        # worse, under-charges ("gpt-5.4-pro-<date>" -> cheaper "gpt-5.4"),
+        # letting budget pre-flight approve an expensive job. Mirrors
+        # get_token_pricing().
+        for cap in sorted(MODEL_CAPABILITIES.values(), key=lambda c: len(c.model or ""), reverse=True):
+            if _normalize_model_name(cap.model) in needle:
                 base = cap.cost_per_query
                 break
 
     if input_tokens is not None:
         for tiered_model, (threshold, multiplier) in _TIERED_PRICING.items():
-            if tiered_model in resolved and input_tokens > threshold:
+            if _normalize_model_name(tiered_model) in needle and input_tokens > threshold:
                 return round(base * multiplier, 4)
 
     return base
diff --git a/docs/MODELS.md b/docs/MODELS.md
index cd5d155..cda104c 100644
--- a/docs/MODELS.md
+++ b/docs/MODELS.md
@@ -1,6 +1,6 @@
 # Model Selection Guide
 
-> **Note**: Model information current as of March 2026. AI models evolve rapidly — verify current pricing at provider websites. The [model registry](../deepr/providers/registry.py) is the source of truth. Run `python scripts/discover_models.py --show-registry` to see all registered models with pricing.
+> **Note**: Model information current as of May 2026. AI models evolve rapidly — verify current pricing at provider websites. The [model registry](../deepr/providers/registry.py) is the source of truth. Run `deepr providers models` to diff the registry against each provider's live model list (flags newer versions of families you already use, with paste-ready registry stubs), or `python scripts/discover_models.py --show-registry` to see all registered models with pricing.
 
 ## Overview
 
@@ -12,14 +12,15 @@ Deepr uses a hybrid approach optimizing for both quality and cost. Different tas
 
 ### OpenAI (`OPENAI_API_KEY`)
 - **Deep Research**: Turnkey async Deep Research API via Responses endpoint
-- **Models**: o3-deep-research, o4-mini-deep-research, GPT-5.4, GPT-5.4-pro, GPT-5-mini, GPT-4.1, GPT-4.1-mini
+- **Models**: o3-deep-research, o4-mini-deep-research, GPT-5.5, GPT-5.4, GPT-5.4-pro, GPT-5.4-mini, GPT-5.4-nano, GPT-5-mini, GPT-4.1, GPT-4.1-mini
 - **Best for**: Deep research, planning, expert system (vector stores require OpenAI-compatible API)
 - **Note**: GPT-5.4 is the current mainline default; use GPT-5.4-pro for hardest tasks and GPT-5-mini for value
 
 ### Google Gemini (`GEMINI_API_KEY`)
 - **Deep Research**: Native Deep Research Agent via Interactions API (async background jobs)
-- **Models**: Gemini 3.1 Pro Preview (default), Gemini 3 Flash, Gemini 2.5 Flash, Deep Research Agent (`deep-research-pro-preview-12-2025`)
+- **Models**: Gemini 3.1 Pro Preview (default), Gemini 3.5 Flash, Gemini 3 Flash, Gemini 3.1 Flash-Lite (GA), Gemini 2.5 Flash, Deep Research Agent (`deep-research-pro-preview-12-2025`)
 - **Best for**: Large context windows (1M+ tokens), document analysis, cost-effective research, agentic workflows
+- **Note**: Gemini 3.5 Flash (GA May 19, 2026, Google I/O 2026) is the newest Flash generation — it beats Gemini 3.1 Pro on coding/agentic/multimodal benchmarks with ~4x faster output than frontier peers, and is priced at $1.50/$9.00 per MTok (input/output)
 
 ### xAI Grok (`XAI_API_KEY`)
 - **Deep Research**: Grok 4.20 Multi-Agent (4–16 parallel agents with autonomous tool use)
@@ -67,9 +68,9 @@ The following legacy models will stop accepting API requests on **May 15, 2026 a
 
 ### Anthropic Claude (`ANTHROPIC_API_KEY`)
 - **Deep Research**: No turnkey API — uses Extended Thinking + tool use + web search orchestration
-- **Models**: Claude Opus 4.6, Claude Sonnet 4.5, Claude Haiku 4.5
+- **Models**: Claude Opus 4.7, Claude Opus 4.6, Claude Sonnet 4.6, Claude Sonnet 4.5, Claude Haiku 4.5
 - **Best for**: Complex reasoning with transparent thinking, coding tasks, nuanced analysis
-- **Note**: Opus 4.6 (latest) recommended for research (~$0.80/query). All models support Extended Thinking. Requires a web search backend (Brave, Tavily, or DuckDuckGo)
+- **Note**: Opus 4.7 (GA Apr 16, 2026; leads SWE-bench Pro at 64.3%) is the latest and recommended for research (~$0.85/query, $5/$25 per MTok — note its new tokenizer uses up to ~35% more tokens for the same text). Sonnet 4.6 ($3/$15) is the best-value coding model. Opus 4.7/4.6 and Sonnet 4.6 include the full 1M context window at standard pricing. All models support Extended Thinking. Requires a web search backend (Brave, Tavily, or DuckDuckGo)
 
 ### Azure OpenAI (`AZURE_OPENAI_KEY`)
 - **Models**: Same as OpenAI, deployed through Azure
@@ -157,8 +158,9 @@ The agent automatically uses Bing web grounding to find and cite current sources
 | Quick Lookups | Grok 4.20 Non-Reasoning | see registry | ~1s | best value for freshness/citation tasks |
 | Latest News / Web | Grok 4.20 Non-Reasoning | see registry | ~1s | real-time web + strong value |
 | Large Documents | Gemini 3.1 Pro | $0.20* | ~40s | 1M token context, configurable thinking |
-| Coding Tasks | Claude Sonnet 4.5 | $0.48 | ~3s | Best for code |
-| Complex Reasoning | Claude Opus 4.6 | see registry | ~seconds | high-end complex reasoning |
+| Fast Coding / Agentic | Gemini 3.5 Flash | ~$0.03 | ~1.5s | Beats 3.1 Pro on coding/agentic at Flash speed ($1.50/$9.00 per MTok) |
+| Coding Tasks | Claude Sonnet 4.6 | $0.48 | ~3s | Best-value coding ($3/$15 per MTok) |
+| Complex Reasoning | Claude Opus 4.7 | see registry | ~seconds | Most capable Claude; leads SWE-bench Pro |
 | Budget General | GPT-4.1-mini | $0.01 | ~1s | Cheapest OpenAI, 1M context |
 
 *\*Gemini 3.1 Pro has tiered pricing: $2/$12 per 1M tokens (input/output) for prompts ≤200K tokens, $4/$18 for prompts >200K tokens. The $0.20/query estimate assumes a typical sub-200K prompt. Large document analysis (250K+ tokens) costs roughly 2x more — e.g., a 500K-token corpus costs ~$2.27 vs ~$1.18 with sub-200K prompts. Use `--dry-run` to check before running.*
diff --git a/scripts/benchmark_models.py b/scripts/benchmark_models.py
index 5ecea71..99a76cf 100644
--- a/scripts/benchmark_models.py
+++ b/scripts/benchmark_models.py
@@ -636,11 +636,13 @@ class ModelSummary:
     # Frontier models
     "openai/gpt-5.5",  # Newest OpenAI frontier (April 2026, 1M+ context)
     "openai/gpt-5.4",  # Previous OpenAI frontier (1M+ context)
-    "anthropic/claude-opus-4-6",  # Most capable Claude ($0.80/query)
+    "anthropic/claude-opus-4-7",  # Most capable Claude — leads SWE-bench Pro ($0.85/query)
+    "anthropic/claude-opus-4-6",  # Prior most-capable Claude ($0.80/query)
     "gemini/gemini-3.1-pro-preview",  # Latest gen, best quality ($0.20/query)
     "gemini/gemini-2.5-pro",  # Thinking model, can't disable thinking ($0.15/query)
     # Mid-tier
-    "anthropic/claude-sonnet-4-5",  # Strong reasoning ($0.48/query)
+    "anthropic/claude-sonnet-4-6",  # Best-value coding Claude ($0.48/query)
+    "anthropic/claude-sonnet-4-5",  # Prior Sonnet ($0.48/query)
     "openai/gpt-4.1",  # 1M context ($0.04/query)
     "openai/o3",  # Reasoning model for complex tasks ($0.10/query)
     "openai/o4-mini",  # Fast reasoning ($0.04/query)
@@ -649,12 +651,16 @@ class ModelSummary:
     "xai/grok-4-20-reasoning",  # xAI multi-agent workhorse ($0.10/query)
     "xai/grok-4-20-non-reasoning",  # xAI flagship non-reasoning ($0.08/query)
     # Budget models
+    "openai/gpt-5.4-mini",  # Newer-gen budget reasoning ($0.05/query)
     "openai/gpt-5-mini",  # Budget reasoning ($0.03/query)
     "openai/gpt-4.1-mini",  # Cheap 1M context ($0.01/query)
+    "openai/gpt-5.4-nano",  # Cheapest GPT-5.4 ($0.01/query)
     "openai/gpt-5-nano",  # Cheapest GPT-5 ($0.005/query)
     "openai/gpt-4.1-nano",  # Cheapest 1M context ($0.003/query)
-    "gemini/gemini-3-flash-preview",  # Newest gen, fast ($0.01/query)
-    "gemini/gemini-3.1-flash-lite-preview",  # Lowest-cost Gemini 3.1
+    "gemini/gemini-3.5-flash",  # Newest Flash gen — beats 3.1 Pro on coding/agentic ($0.03/query)
+    "gemini/gemini-3-flash-preview",  # Prior gen, fast ($0.01/query)
+    "gemini/gemini-3.1-flash-lite",  # Most cost-effective Gemini, GA ($0.007/query)
+    "gemini/gemini-3.1-flash-lite-preview",  # Prior preview of Flash-Lite
     "anthropic/claude-haiku-4-5",  # Budget Anthropic ($0.05/query)
 ]
 
@@ -674,7 +680,9 @@ class ModelSummary:
     "xai/grok-4-20-non-reasoning",
     # Gemini (Google grounding)
     "gemini/gemini-3.1-pro-preview",
+    "gemini/gemini-3.5-flash",
     "gemini/gemini-3-flash-preview",
+    "gemini/gemini-3.1-flash-lite",
     "gemini/gemini-3.1-flash-lite-preview",
     "gemini/gemini-2.5-flash",
     "gemini/gemini-2.5-pro",
@@ -702,7 +710,7 @@ class ModelSummary:
     "xai/grok-4-20-reasoning",
     "xai/grok-4-20-non-reasoning",
     "xai/grok-4-1-fast-reasoning",
-    "xai/grok-4.3",
+    "xai/grok-4-3",
 ]
 
 # Documentation tier models: web-search-capable models that can fetch + document APIs.
@@ -711,12 +719,14 @@ class ModelSummary:
     "openai/gpt-5-mini",
     "openai/o3",
     "gemini/gemini-3.1-pro-preview",
+    "gemini/gemini-3.5-flash",
     "gemini/gemini-2.5-pro",
+    "gemini/gemini-3.1-flash-lite",
     "gemini/gemini-3.1-flash-lite-preview",
     "xai/grok-4-20-reasoning",
     "xai/grok-4-20-non-reasoning",
     "xai/grok-4-1-fast-reasoning",
-    "xai/grok-4.3",
+    "xai/grok-4-3",
 ]
 
 # Provider → (env var, API base URL)
@@ -753,6 +763,38 @@ def load_registry():
     return mod.MODEL_CAPABILITIES
 
 
+def warn_if_newer_models_available() -> None:
+    """Best-effort preflight: warn if providers expose newer models than the
+    registry knows about.
+
+    Reuses scripts/discover_models.py so the "newer version of a family we use"
+    logic lives in one place. Never raises — discovery is networked and optional,
+    and must not block a benchmark.
+    """
+    try:
+        spec = importlib.util.spec_from_file_location(
+            "discover_models", PROJECT_ROOT / "scripts" / "discover_models.py"
+        )
+        dm = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(dm)
+
+        reg = dm.load_registry()
+        discovered = dm.discover_via_api()
+        if not discovered:
+            return
+        report = dm.compare_registry(reg, discovered)
+        relevant, _ = dm.classify_new_models(report["new_models"], reg)
+        if relevant:
+            names = ", ".join(f"{m.provider}/{m.model_id}" for m in relevant[:6])
+            extra = "" if len(relevant) <= 6 else f" (+{len(relevant) - 6} more)"
+            print(f"  NOTE: {len(relevant)} newer model(s) available but not in the registry: {names}{extra}")
+            print("        Run 'deepr providers models' to review them and get registry stubs.")
+            print()
+    except Exception:
+        # Discovery is best-effort; never block a benchmark on it.
+        pass
+
+
 # ─── Preflight ────────────────────────────────────────────────────────────────
 
 
@@ -2814,6 +2856,11 @@ def main():
         help="Alias for --fill-gaps (run only newly added model+tier combos).",
     )
     parser.add_argument("--workers", type=int, default=5, help="Parallel eval workers (default: 5)")
+    parser.add_argument(
+        "--skip-discovery-check",
+        action="store_true",
+        help="Skip the preflight check for newer provider models not in the registry",
+    )
     parser.add_argument("-v", "--verbose", action="store_true", help="Verbose logging")
 
     args = parser.parse_args()
@@ -2894,6 +2941,9 @@ def main():
 
     print_preflight(key_status, all_models, est_total)
 
+    if not args.skip_discovery_check:
+        warn_if_newer_models_available()
+
     if (not args.dry_run) and args.max_estimated_cost is not None and est_total > args.max_estimated_cost:
         print(
             f"\n  ABORT: estimated cost ${est_total:.2f} exceeds --max-estimated-cost ${args.max_estimated_cost:.2f}."
@@ -2958,6 +3008,12 @@ def main():
 
     all_results = compute_combined_scores(all_results, use_judge)
 
+    # Snapshot the results actually executed this run, BEFORE merging prior
+    # history. The merge below adds historical evals (for richer rankings),
+    # but those incurred no new spend — so the reported cost must come from
+    # this snapshot, not the merged dataset.
+    executed_results = list(all_results)
+
     # ─── Merge prior saved results (--fill-gaps) ─────────────────────────
     if args.fill_gaps and prior_saved:
         # Convert prior saved results to EvalResult objects for merged reporting
@@ -2993,8 +3049,14 @@ def main():
     # ─── Phase 4: Report ─────────────────────────────────────────────────
     summaries = build_summaries(all_results, registry)
 
-    # Calculate actual cost
-    total_cost = sum(s.total_cost for s in summaries)
+    # Cost reflects only evals executed this run. When --fill-gaps merges
+    # prior history into the rankings, summing every summary's cost would
+    # report the (huge) cost of re-running the entire dataset rather than
+    # this run's actual spend.
+    if len(executed_results) != len(all_results):
+        total_cost = sum(s.total_cost for s in build_summaries(executed_results, registry))
+    else:
+        total_cost = sum(s.total_cost for s in summaries)
 
     # Save results first (before report printing, which can fail on encoding)
     if args.save:
diff --git a/scripts/discover_models.py b/scripts/discover_models.py
index 492b511..dbc0325 100644
--- a/scripts/discover_models.py
+++ b/scripts/discover_models.py
@@ -33,6 +33,7 @@
 import json
 import logging
 import os
+import re
 import sys
 from dataclasses import asdict, dataclass
 from pathlib import Path
@@ -41,6 +42,15 @@
 PROJECT_ROOT = Path(__file__).resolve().parent.parent
 sys.path.insert(0, str(PROJECT_ROOT))
 
+# Load .env so keys configured there (not just exported in the shell) are
+# visible — otherwise discovery silently skips providers like Anthropic/Azure.
+try:
+    from dotenv import load_dotenv
+
+    load_dotenv(PROJECT_ROOT / ".env")
+except ImportError:
+    pass
+
 logging.basicConfig(level=logging.WARNING, format="%(levelname)s: %(message)s")
 logger = logging.getLogger(__name__)
 
@@ -536,6 +546,31 @@ def _parse_llm_response(text: str) -> list[DiscoveredModel]:
 # ─── Comparison ───────────────────────────────────────────────────────────────
 
 
+_DATE_SUFFIX_RE = re.compile(r"-\d{4}-\d{2}-\d{2}\b")  # hyphenated date stamp, e.g. -2026-03-05
+_DATE_COMPACT_RE = re.compile(r"-\d{8}\b")  # compact 8-digit date stamp, e.g. -20251001
+_SNAPSHOT_RE = re.compile(r"-\d{4}\b(?=-|$)")  # whole 4-digit token, e.g. -0309 (xAI dated builds)
+
+
+def _canonical_model(model_id: str) -> str:
+    """Normalize an API model ID so it can be compared with registry names.
+
+    The ID is lowercased and dots become hyphens; ISO dates, compact dates and
+    four-digit build stamps are then removed, so spelling variants of a single
+    model all map to the same string:
+      - grok-4.3                  -> grok-4-3
+      - claude-haiku-4-5-20251001 -> claude-haiku-4-5
+      - gpt-5.4-2026-03-05        -> gpt-5-4
+      - grok-4.20-0309-reasoning  -> grok-4-20-reasoning
+
+    Hyphen runs left behind by a removal collapse to one, and edge hyphens go.
+    """
+    canonical = model_id.lower().replace(".", "-")
+    # Order matters: strip full ISO dates before the generic 4-digit stamp.
+    for stamp_re in (_DATE_SUFFIX_RE, _DATE_COMPACT_RE, _SNAPSHOT_RE):
+        canonical = stamp_re.sub("", canonical)
+    return re.sub(r"-{2,}", "-", canonical).strip("-")
+
+
 def compare_registry(
     registry: dict[str, RegistryModel],
     discovered: list[DiscoveredModel],
@@ -547,21 +582,29 @@ def compare_registry(
     - pricing_changes: models where pricing differs
     - in_registry: models that match
     - registry_only: models in registry but not discovered
+
+    Matching is done on a canonicalized (provider, model) key so naming
+    variations (dot/hyphen, date snapshots) don't masquerade as new models.
     """
-    # Build lookup from discovered models
-    discovered_lookup: dict[str, DiscoveredModel] = {}
+    # Canonical registry index: (provider, canonical_model) -> RegistryModel
+    registry_by_canon: dict[tuple[str, str], RegistryModel] = {}
+    for rm in registry.values():
+        registry_by_canon[(rm.provider, _canonical_model(rm.model))] = rm
+
+    # Deduplicate discovered models that canonicalize to the same key (e.g. a
+    # dated snapshot plus its alias) so we don't report the same model twice.
+    discovered_by_canon: dict[tuple[str, str], DiscoveredModel] = {}
     for m in discovered:
-        # Normalize key: provider/model_id
-        key = f"{m.provider}/{m.model_id}"
-        discovered_lookup[key] = m
+        discovered_by_canon.setdefault((m.provider, _canonical_model(m.model_id)), m)
 
     new_models = []
     pricing_changes = []
     in_registry = []
 
-    for key, dm in discovered_lookup.items():
-        if key in registry:
-            rm = registry[key]
+    for canon_key, dm in discovered_by_canon.items():
+        rm = registry_by_canon.get(canon_key)
+        if rm is not None:
+            key = f"{rm.provider}/{rm.model}"
             in_registry.append({"key": key, "discovered": dm, "registry": rm})
 
             # Check pricing changes (only if LLM provided pricing)
@@ -579,14 +622,11 @@ def compare_registry(
                         }
                     )
         else:
-            # Check if any registry key contains this model ID (partial match)
-            partial = [k for k in registry if dm.model_id in k]
-            if not partial:
-                new_models.append(dm)
+            new_models.append(dm)
 
-    # Models in registry but not discovered
-    discovered_model_ids = {m.model_id for m in discovered}
-    registry_only = [rm for rm in registry.values() if rm.model not in discovered_model_ids]
+    # Models in registry but not discovered (canonical comparison)
+    discovered_canon = set(discovered_by_canon.keys())
+    registry_only = [rm for rm in registry.values() if (rm.provider, _canonical_model(rm.model)) not in discovered_canon]
 
     return {
         "new_models": new_models,
@@ -628,18 +668,152 @@ def print_registry_table(registry: dict[str, RegistryModel]):
     print(f"  Total: {len(registry)} models\n")
 
 
-def print_comparison_report(report: dict):
-    """Print a human-readable comparison report."""
+# Substrings (checked against the lowercased ID) marking non-text / non-research
+# families we never want as routing candidates: image, audio, embeddings, etc.
+_NON_TEXT_MARKERS = (
+    "tts",
+    "transcribe",
+    "diarize",
+    "embedding",
+    "image",
+    "audio",
+    "realtime",
+    "robotics",
+    "computer-use",
+    "search-preview",
+    "whisper",
+    "dall-e",
+    "moderation",
+)
+
+_VERSION_TOKEN_RE = re.compile(r"^\d+(?:\.\d+)?$")  # whole token is a number, e.g. "5" or "5.4"
+_DATEISH_TOKEN_RE = re.compile(r"^\d{4,8}$")  # whole token is 4-8 digits (also matched by _VERSION_TOKEN_RE)
+_QUALIFIER_TOKENS = {"preview", "latest", "stable", "exp", "experimental"}  # release-channel words, not families
+
+
+def _is_text_model(model_id: str) -> bool:
+    """Return False when the lowercased ID contains any non-text marker substring."""
+    lowered = model_id.lower()
+    return all(marker not in lowered for marker in _NON_TEXT_MARKERS)
+
+
+def _family_signature(model_id: str) -> str:
+    """Build a family key: the hyphen-separated tokens that name the model line.
+
+    Version numbers, date-like digit runs and release qualifiers are discarded,
+    e.g. gpt-5.4-mini -> 'gpt-mini' and claude-opus-4-7 -> 'claude-opus'.
+    """
+    def is_noise(tok: str) -> bool:
+        return bool(_VERSION_TOKEN_RE.match(tok) or _DATEISH_TOKEN_RE.match(tok)) or tok in _QUALIFIER_TOKENS
+
+    kept = (tok for tok in model_id.lower().split("-") if not is_noise(tok))
+    return "-".join(kept)
+
+
+def _version_key(model_id: str) -> tuple[float, ...]:
+    """Numeric version tuple for ordering (gpt-5.4-mini -> (5.4,), claude-opus-4-7 -> (4.7,))."""
+    dotted = model_id.lower().replace("-", ".")
+    return tuple(map(float, re.findall(r"\d+(?:\.\d+)?", dotted))) or (0.0,)
+
+
+def classify_new_models(
+    new_models: list[DiscoveredModel],
+    registry: dict[str, RegistryModel],
+) -> tuple[list[DiscoveredModel], list[DiscoveredModel]]:
+    """Partition unregistered discoveries into ``(relevant, other)`` lists.
+
+    A model is relevant when it is a text model and either (a) its
+    (provider, family) already exists in the registry and its version key is
+    greater than every registered version of that family, or (b) it is the GA
+    name of a model the registry only lists with a ``-preview`` suffix.
+    Non-text variants, unknown families and non-newer versions land in other.
+    Input order is preserved within each returned list.
+    """
+    newest_by_family: dict[tuple[str, str], tuple[float, ...]] = {}
+    registered: set[tuple[str, str]] = set()
+    preview_bases: set[tuple[str, str]] = set()
+    suffix = "-preview"
+    for entry in registry.values():
+        family = (entry.provider, _family_signature(entry.model))
+        version = _version_key(entry.model)
+        newest_by_family[family] = max(version, newest_by_family.get(family, version))
+        canonical = _canonical_model(entry.model)
+        registered.add((entry.provider, canonical))
+        if canonical.endswith(suffix):
+            preview_bases.add((entry.provider, canonical[: -len(suffix)]))
+
+    def _is_relevant(model: DiscoveredModel) -> bool:
+        if not _is_text_model(model.model_id):
+            return False
+        ident = (model.provider, _canonical_model(model.model_id))
+        if "preview" not in ident[1] and ident in preview_bases and ident not in registered:
+            return True  # GA promotion of a preview-only registry entry
+        family = (model.provider, _family_signature(model.model_id))
+        return family in newest_by_family and _version_key(model.model_id) > newest_by_family[family]
+
+    relevant: list[DiscoveredModel] = []
+    other: list[DiscoveredModel] = []
+    for model in new_models:
+        (relevant if _is_relevant(model) else other).append(model)
+    return relevant, other
+
+
+def _registry_stub(m: DiscoveredModel) -> str:
+    """Return a paste-ready ``ModelCapability`` registry entry for ``m``.
+
+    Provider APIs do not report pricing, so cost and tuning fields are emitted
+    as TODO-tagged placeholders that must be filled in by hand before use. A
+    missing or zero context window is emitted as the bare word TODO.
+    """
+    context = m.context_window if m.context_window else "TODO"
+    return "\n".join([
+        f'    "{m.provider}/{m.model_id}": ModelCapability(',
+        f'        provider="{m.provider}",',
+        f'        model="{m.model_id}",',
+        "        cost_per_query=0.0,  # TODO: estimate per-query cost",
+        "        latency_ms=2000,  # TODO: measure",
+        f"        context_window={context},",
+        "        specializations=[],  # TODO",
+        "        strengths=[],  # TODO",
+        "        weaknesses=[],",
+        "        input_cost_per_1m=0.0,  # TODO: from provider pricing page",
+        "        output_cost_per_1m=0.0,  # TODO: from provider pricing page",
+        "    ),",
+    ])
+
+
+def print_comparison_report(
+    report: dict,
+    registry: dict[str, RegistryModel] | None = None,
+    show_all: bool = False,
+    emit_stubs: bool = True,
+):
+    """Print a human-readable comparison report.
+
+    By default (when ``registry`` is provided and ``show_all`` is False), only
+    "relevant" new models are highlighted: newer versions of model families we
+    already use. Pass ``show_all=True`` to list every discovered-but-unregistered
+    model (including older versions, non-text variants, and brand-new families).
+    """
     new_models = report["new_models"]
     pricing_changes = report["pricing_changes"]
     in_registry = report["in_registry"]
     registry_only = report["registry_only"]
 
+    if registry is not None and not show_all:
+        relevant, other = classify_new_models(new_models, registry)
+        highlighted = relevant
+        other_count = len(other)
+    else:
+        highlighted = new_models
+        other_count = 0
+
     # New models
-    if new_models:
-        print(f"\n  NEW MODELS AVAILABLE ({len(new_models)}):")
+    if highlighted:
+        label = "NEW RELEVANT MODELS" if (registry is not None and not show_all) else "NEW MODELS AVAILABLE"
+        print(f"\n  {label} ({len(highlighted)}):")
         print("  " + "─" * 66)
-        for m in new_models:
+        for m in highlighted:
             pricing = ""
             if m.input_cost_per_1m > 0:
                 pricing = f"  ${m.input_cost_per_1m:.2f}/${m.output_cost_per_1m:.2f} per MTok"
@@ -648,6 +822,9 @@ def print_comparison_report(report: dict):
             print(f"    + {m.provider}/{m.model_id}{pricing}{ctx}{notes}")
         print()
 
+    if other_count:
+        print(f"  ({other_count} other discovered models hidden — older versions, new families, or non-text. Use --all.)\n")
+
     # Pricing changes
     if pricing_changes:
         print(f"\n  PRICING CHANGES ({len(pricing_changes)}):")
@@ -661,7 +838,11 @@ def print_comparison_report(report: dict):
     # Summary
     print("  SUMMARY:")
     print(f"    In registry:      {len(in_registry)}")
-    print(f"    New available:    {len(new_models)}")
+    if registry is not None and not show_all:
+        print(f"    New relevant:     {len(highlighted)}")
+        print(f"    Other (hidden):   {other_count}")
+    else:
+        print(f"    New available:    {len(new_models)}")
     print(f"    Pricing changes:  {len(pricing_changes)}")
     print(f"    Registry-only:    {len(registry_only)}")
 
@@ -670,7 +851,14 @@ def print_comparison_report(report: dict):
         for rm in sorted(registry_only, key=lambda x: x.key):
             print(f"    ? {rm.key}")
 
-    if not new_models and not pricing_changes:
+    # Draft registry stubs for the relevant new models.
+    if emit_stubs and highlighted:
+        print("\n  SUGGESTED REGISTRY ENTRIES (fill in TODOs, then paste into registry.py):")
+        print("  " + "─" * 66)
+        for m in highlighted:
+            print(_registry_stub(m))
+
+    if not highlighted and not pricing_changes:
         print("\n  Registry is up to date!")
 
 
@@ -683,11 +871,26 @@ def _format_context(tokens: int) -> str:
     return str(tokens)
 
 
-def output_json(registry: dict[str, RegistryModel], report: dict):
-    """Output report as JSON."""
+def output_json(
+    registry: dict[str, RegistryModel],
+    report: dict,
+    registry_obj: dict[str, RegistryModel] | None = None,
+    show_all: bool = False,
+):
+    """Output report as JSON.
+
+    When a registry is supplied and ``show_all`` is False, ``new_models`` holds
+    only the relevant (newer-version-of-a-known-family) models and
+    ``new_models_other`` holds the rest, so CI can alert on the relevant set.
+    """
+    if registry_obj is not None and not show_all:
+        relevant, other = classify_new_models(report["new_models"], registry_obj)
+    else:
+        relevant, other = report["new_models"], []
     result = {
         "registry_count": len(registry),
-        "new_models": [asdict(m) for m in report["new_models"]],
+        "new_models": [asdict(m) for m in relevant],
+        "new_models_other": [asdict(m) for m in other],
         "pricing_changes": report["pricing_changes"],
         "matched": len(report["in_registry"]),
         "registry_only": [asdict(m) for m in report["registry_only"]],
@@ -699,6 +902,15 @@ def output_json(registry: dict[str, RegistryModel], report: dict):
 
 
 def main():
+    # The report uses box-drawing characters (U+2500 etc.). On Windows the
+    # default console encoding (cp1252) can't encode them and the script
+    # crashes mid-report. Force UTF-8 output where the stream supports it.
+    if hasattr(sys.stdout, "reconfigure"):
+        try:
+            sys.stdout.reconfigure(encoding="utf-8")
+        except (ValueError, OSError):
+            pass
+
     parser = argparse.ArgumentParser(
         description="Discover available AI models and compare against Deepr's registry.",
         formatter_class=argparse.RawDescriptionHelpFormatter,
@@ -739,6 +951,16 @@ def main():
         action="store_true",
         help="Show current registry and exit",
     )
+    parser.add_argument(
+        "--all",
+        action="store_true",
+        help="Show every discovered model (default: only newer versions of families already in the registry)",
+    )
+    parser.add_argument(
+        "--no-stubs",
+        action="store_true",
+        help="Do not print suggested registry entry stubs for new relevant models",
+    )
     parser.add_argument(
         "--verbose",
         "-v",
@@ -791,9 +1013,9 @@ def main():
 
     # Output
     if args.format == "json":
-        output_json(registry, report)
+        output_json(registry, report, registry_obj=registry, show_all=args.all)
     else:
-        print_comparison_report(report)
+        print_comparison_report(report, registry=registry, show_all=args.all, emit_stubs=not args.no_stubs)
 
 
 if __name__ == "__main__":
diff --git a/tests/unit/test_providers/test_grok_provider.py b/tests/unit/test_providers/test_grok_provider.py
index e1ccb9c..89df1b0 100644
--- a/tests/unit/test_providers/test_grok_provider.py
+++ b/tests/unit/test_providers/test_grok_provider.py
@@ -380,3 +380,31 @@ async def test_cancel_job(self, provider):
             # but the method should still be callable
             result = await provider.cancel_job(job_id)
             assert isinstance(result, bool)
+
+
+class TestGrokHyphenatedRegistryForms:
+    """Regression: hyphenated ``grok-4-20-*`` registry names must map to the
+    dotted xAI API IDs and share their price ($8.0 per 1M in + 1M out), instead
+    of falling through to the grok-4-1-fast default (~$0.70, an ~11x undercharge).
+    """
+
+    @pytest.fixture
+    def provider(self):
+        return GrokProvider(api_key="test-xai-key")
+
+    def test_hyphenated_forms_map_to_api_ids(self, provider):
+        resolve = provider.get_model_name
+        assert resolve("grok-4-20-reasoning") == "grok-4.20-0309-reasoning"
+        assert resolve("grok-4-20-non-reasoning") == "grok-4.20-0309-non-reasoning"
+        assert resolve("grok-4-20-multi-agent") == "grok-4.20-multi-agent-0309"
+
+    def test_hyphenated_forms_priced_at_flagship_rate(self, provider):
+        # 1M tokens each way at $2 in / $6 out per MTok comes to $8.0 (fast default: $0.70).
+        for model in ("grok-4-20-reasoning", "grok-4-20-non-reasoning", "grok-4-20-multi-agent"):
+            billed = provider._calculate_cost(1_000_000, 1_000_000, model)
+            assert billed == 8.0, f"{model} mispriced: {billed}"
+
+    def test_dotted_and_hyphenated_prices_match(self, provider):
+        million = 1_000_000
+        hyphenated = provider._calculate_cost(million, million, "grok-4-20-reasoning")
+        assert hyphenated == provider._calculate_cost(million, million, "grok-4.20-0309-reasoning")
diff --git a/tests/unit/test_providers/test_registry.py b/tests/unit/test_providers/test_registry.py
index 87f69bc..6935b45 100644
--- a/tests/unit/test_providers/test_registry.py
+++ b/tests/unit/test_providers/test_registry.py
@@ -6,6 +6,7 @@
     MODEL_CAPABILITIES,
     ModelCapability,
     get_cheapest_model,
+    get_cost_estimate,
     get_fastest_model,
     get_largest_context_model,
     get_model_capability,
@@ -175,5 +176,42 @@ def test_cost_latency_tradeoff(self):
         assert deep_research.latency_ms > fastest.latency_ms
 
 
+class TestCostEstimateMatching:
+    """Regression tests: get_cost_estimate must pick the most specific model.
+
+    The earlier first-substring-match fallback mapped snapshot/variant names
+    onto a shorter family member: mini was over-charged at the full price and
+    gpt-5.4-pro-<date> under-charged as gpt-5.4, letting budget pre-flight
+    approve an expensive job against an underestimate.
+    """
+
+    def test_exact_keys(self):
+        expected = {"gpt-5.4": 0.30, "gpt-5.4-mini": 0.05, "gpt-5.4-nano": 0.01}
+        for model, price in expected.items():
+            assert get_cost_estimate(model) == price
+
+    def test_snapshot_resolves_to_specific_model_not_prefix(self):
+        # Dated mini/nano snapshots keep their own price, not the pricier gpt-5.4 one.
+        for snapshot, price in (("gpt-5.4-mini-2026-03-17", 0.05), ("gpt-5.4-nano-2026-03-17", 0.01)):
+            assert get_cost_estimate(snapshot) == price
+
+    def test_pro_snapshot_does_not_underestimate(self):
+        # Under-estimating is the risky direction: pro snapshots must get the pro price.
+        for snapshot, price in (("gpt-5.4-pro-2026-03-05", 0.90), ("gpt-5.5-pro-2026-04-23", 1.50)):
+            assert get_cost_estimate(snapshot) == price
+
+    def test_dotted_and_hyphenated_grok_equivalent(self):
+        dotted, hyphenated = get_cost_estimate("grok-4.3"), get_cost_estimate("grok-4-3")
+        assert dotted == hyphenated
+
+    def test_tiered_pricing_preserved(self):
+        model = "gemini-3.1-pro-preview"
+        doubled = round(get_cost_estimate(model) * 2.0, 4)
+        assert get_cost_estimate(model, input_tokens=300_000) == doubled
+
+    def test_unknown_model_returns_default(self):
+        assert get_cost_estimate("totally-made-up-model-xyz") == 0.20
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])