From 98f962dc19a0b5e83ab00b09e21bd90f264ae7ff Mon Sep 17 00:00:00 2001 From: Chenglong Wang Date: Thu, 14 May 2026 17:47:18 +0800 Subject: [PATCH 1/2] fix(auto-run): unwedge experiment loop when provider rejects `temperature` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PRJ-0002 (2026-05-14) wedged on `azure/anthropic/claude-opus-4-7` returning `invalid_request_error: \`temperature\` is deprecated for this model.` Every auto-run round re-hit the same parameter error until `_AUTO_MAX_ROUNDS` (20) was exhausted, surfacing the error to the user instead of any research result. Three independent bugs collided: 1. `temperature` was unconditionally attached to outbound requests for models that no longer accept it. Centralise the rule in a new `providers.model_compat` registry (currently lists `claude-opus-4-7`) and gate temperature emission on it in the Azure, LiteLLM, and Anthropic providers. Azure's existing `_supports_temperature` rule for `gpt-5`/`o*` deployments is preserved on top. 2. `RoutedProviderManager.chat` blindly walked the fallback chain when a provider RAISED rather than returned an error response, so a permanent 4xx burned every remaining candidate. Apply the same `_should_retry_with_fallback` classification used on the response path; non-retryable exceptions now short-circuit immediately. 3. `_evaluate_continuation` only stopped auto mode on failure responses when `strictHeuristics` was on. LLM-provider errors are NOT experiment outcomes — they mean the model never produced a turn, so the next round will hit the same error. Add an unconditional `_looks_like_llm_provider_error` check that halts auto mode with `stop_reason="llm provider error"` regardless of policy. 
Tests cover the model_compat blocklist under every provider prefix, the Azure body builder dropping temperature for `claude-opus-4-7`, non-retryable raised exceptions not burning the fallback chain, and auto-run halting on the exact error text observed in PRJ-0002. --- mira_engine/agent/research_loop.py | 47 +++++++++ mira_engine/agent/routing.py | 16 ++- mira_engine/providers/anthropic_provider.py | 14 ++- .../providers/azure_openai_provider.py | 14 ++- mira_engine/providers/litellm_provider.py | 9 ++ mira_engine/providers/model_compat.py | 44 +++++++++ tests/providers/test_azure_openai_provider.py | 32 ++++++ tests/providers/test_model_compat.py | 39 ++++++++ tests/test_model_routing.py | 98 +++++++++++++++++++ tests/test_research_loop_core.py | 51 ++++++++++ 10 files changed, 358 insertions(+), 6 deletions(-) create mode 100644 mira_engine/providers/model_compat.py create mode 100644 tests/providers/test_model_compat.py diff --git a/mira_engine/agent/research_loop.py b/mira_engine/agent/research_loop.py index 8829a7b..f3a3705 100644 --- a/mira_engine/agent/research_loop.py +++ b/mira_engine/agent/research_loop.py @@ -321,6 +321,45 @@ def _looks_like_failure_response(cls, text: str | None) -> bool: ) return any(k in tail for k in soft_signals) + @classmethod + def _looks_like_llm_provider_error(cls, text: str | None) -> bool: + """Detect when ``final_content`` is a system-level LLM call failure. + + These are surfaced by provider error handlers (``_handle_error`` in + ``anthropic_provider`` / ``azure_openai_provider`` / etc., and the + chain-failure path in ``RoutedProviderManager.chat``) and must always + halt auto mode — they are NOT experiment outcomes. Without this + guard a parameter-level 4xx (e.g. Azure dropping ``temperature``) + gets retried every round until ``_AUTO_MAX_ROUNDS`` (20) is + exhausted, which is what we saw in PRJ-0002 on 2026-05-14. 
+ + Unlike :meth:`_looks_like_failure_response`, this check fires + regardless of ``strictHeuristics`` because the agent has not even + produced a turn — there is nothing to iterate on. + """ + if not text: + return False + lowered = text.lower() + markers = ( + # Provider wrappers (see anthropic_provider._handle_error, + # azure_openai_provider._handle_error, openai_compat_provider, + # litellm_provider.chat, openai_codex_provider). + "error calling llm", + "error calling azure openai", + "error calling codex", + "error calling github copilot", + # Underlying SDK / gateway error types. + "litellm.badrequesterror", + "azure_aiexception", + "invalid_request_error", + "bad_request_error", + # RoutedProviderManager terminal message. + "all candidate models failed for this turn", + # base_loop fallback when an error response has no content. + "sorry, i encountered an error calling the ai model", + ) + return any(marker in lowered for marker in markers) + # ------------------------------------------------------------------ # task_plan loaders / inspectors # ------------------------------------------------------------------ @@ -874,6 +913,14 @@ def _evaluate_continuation( """ if run_mode != "auto": return False, None + # LLM provider errors must halt the loop unconditionally — the agent + # never even produced a turn, so the next round will hit the exact + # same failure (parameter rejected, auth invalid, gateway down, ...). + # Without this guard a single bad request burns through all 20 + # auto-run rounds before surfacing the error to the user. 
+ if self._looks_like_llm_provider_error(final_content): + logger.warning("Auto mode halting: LLM provider error in final response") + return False, "llm provider error" if not self._guard_task_plan_structure(project_dir, profile=agent_profile): return False, "task_plan guardrail blocking" if auto_round >= self._AUTO_MAX_ROUNDS: diff --git a/mira_engine/agent/routing.py b/mira_engine/agent/routing.py index 7c13ea0..4fb99e8 100644 --- a/mira_engine/agent/routing.py +++ b/mira_engine/agent/routing.py @@ -281,14 +281,26 @@ async def chat( except Exception as exc: last_error = exc self._mark_model_failed(model) - if index < len(candidates) - 1: + # Mirror the response-path classification so a permanent 4xx + # (auth, invalid_request_error, bad request, ...) does not + # blindly burn the rest of the fallback chain — those errors + # will fail identically on every other candidate. + exc_text = str(exc) or exc.__class__.__name__ + is_retryable = self._should_retry_with_fallback(exc_text) + if is_retryable and index < len(candidates) - 1: logger.warning( - "Model '{}' raised '{}'; trying fallback model '{}'", + "Model '{}' raised retryable error '{}'; trying fallback model '{}'", model, exc, candidates[index + 1], ) continue + if not is_retryable: + logger.warning( + "Model '{}' raised non-retryable error '{}'; skipping fallback candidates", + model, + exc, + ) raise if response.finish_reason != "error" or not self._should_retry_with_fallback(response.content): diff --git a/mira_engine/providers/anthropic_provider.py b/mira_engine/providers/anthropic_provider.py index 07cb001..863a442 100644 --- a/mira_engine/providers/anthropic_provider.py +++ b/mira_engine/providers/anthropic_provider.py @@ -13,6 +13,7 @@ import json_repair from mira_engine.providers.base import LLMProvider, LLMResponse, ToolCallRequest +from mira_engine.providers.model_compat import model_supports_temperature _ALNUM = string.ascii_letters + string.digits @@ -380,19 +381,26 @@ def _build_kwargs( if 
system: kwargs["system"] = system + # Some models (e.g. claude-opus-4-7 fronted by Azure AI) reject + # `temperature` outright with `invalid_request_error`. Resolve this + # once via the shared registry so every code path agrees. + temperature_allowed = model_supports_temperature(model) and model_supports_temperature(model_name) + if reasoning_effort == "adaptive": # Adaptive thinking: model decides when and how much to think # Supported on claude-sonnet-4-6 and claude-opus-4-6. # Also auto-enables interleaved thinking between tool calls. kwargs["thinking"] = {"type": "adaptive"} - kwargs["temperature"] = 1.0 + if temperature_allowed: + kwargs["temperature"] = 1.0 elif thinking_enabled: budget_map = {"low": 1024, "medium": 4096, "high": max(8192, max_tokens)} budget = budget_map.get(reasoning_effort.lower(), 4096) kwargs["thinking"] = {"type": "enabled", "budget_tokens": budget} kwargs["max_tokens"] = max(max_tokens, budget + 4096) - kwargs["temperature"] = 1.0 - else: + if temperature_allowed: + kwargs["temperature"] = 1.0 + elif temperature_allowed: kwargs["temperature"] = temperature if anthropic_tools: diff --git a/mira_engine/providers/azure_openai_provider.py b/mira_engine/providers/azure_openai_provider.py index 5c52fa8..7b15ac6 100644 --- a/mira_engine/providers/azure_openai_provider.py +++ b/mira_engine/providers/azure_openai_provider.py @@ -9,6 +9,7 @@ from openai import AsyncOpenAI from mira_engine.providers.base import LLMProvider, LLMResponse +from mira_engine.providers.model_compat import model_supports_temperature from mira_engine.providers.openai_responses import ( consume_sdk_stream, convert_messages, @@ -55,9 +56,20 @@ def _supports_temperature( deployment_name: str, reasoning_effort: str | None = None, ) -> bool: - """Return True when temperature is likely supported for this deployment.""" + """Return True when temperature is likely supported for this deployment. 
+ + Combines two rule sets: + - Azure-hosted OpenAI reasoning deployments (``gpt-5``, ``o1``, + ``o3``, ``o4``) and any call passing ``reasoning_effort`` drop + ``temperature``. + - Models flagged in :mod:`providers.model_compat` (e.g. + Azure-hosted ``claude-opus-4-7``) drop ``temperature`` regardless + of deployment prefix. + """ if reasoning_effort: return False + if not model_supports_temperature(deployment_name): + return False name = deployment_name.lower() return not any(token in name for token in ("gpt-5", "o1", "o3", "o4")) diff --git a/mira_engine/providers/litellm_provider.py b/mira_engine/providers/litellm_provider.py index 9c5d7bc..08f1368 100644 --- a/mira_engine/providers/litellm_provider.py +++ b/mira_engine/providers/litellm_provider.py @@ -12,6 +12,7 @@ from loguru import logger from mira_engine.providers.base import LLMProvider, LLMResponse, ToolCallRequest +from mira_engine.providers.model_compat import model_supports_temperature from mira_engine.providers.registry import find_by_model, find_gateway # Standard chat-completion message keys. @@ -247,6 +248,14 @@ async def chat( "temperature": temperature, } + # Strip `temperature` for models that reject it (e.g. + # ``azure/anthropic/claude-opus-4-7`` returns + # ``invalid_request_error: \`temperature\` is deprecated for this model.``). + # Done before _apply_model_overrides so a registry override can still + # re-add it intentionally if some future provider needs that. + if not model_supports_temperature(original_model) or not model_supports_temperature(model): + kwargs.pop("temperature", None) + # Apply model-specific overrides (e.g. kimi-k2.5 temperature) self._apply_model_overrides(model, kwargs) diff --git a/mira_engine/providers/model_compat.py b/mira_engine/providers/model_compat.py new file mode 100644 index 0000000..6ddb1a3 --- /dev/null +++ b/mira_engine/providers/model_compat.py @@ -0,0 +1,44 @@ +r"""Model-level compatibility flags shared across providers. 
+ +Some newer LLMs (Anthropic Claude Opus 4-7, several OpenAI reasoning +deployments, etc.) reject the ``temperature`` parameter outright. The +property is **model-scoped**, not provider-scoped: the same model is +exposed by Anthropic directly, by Azure AI's Anthropic deployment, by +AiHubMix/OpenRouter gateways, etc., and every path must drop the +parameter or the request 400s. + +Keeping the rule in one place avoids the failure mode we observed in +production where ``azure/anthropic/claude-opus-4-7`` repeatedly errored +with ``\`temperature\` is deprecated for this model.`` while every +provider builder still attached ``temperature``. + +To extend, add a substring token to ``TEMPERATURE_UNSUPPORTED_MODEL_TOKENS``. +""" + +from __future__ import annotations + +# Substrings (case-insensitive) that identify models which reject +# `temperature`. Match is intentionally loose so it catches the model +# under every provider/gateway prefix (``anthropic/...``, +# ``azure/anthropic/...``, ``openrouter/anthropic/...``, etc.). +TEMPERATURE_UNSUPPORTED_MODEL_TOKENS: frozenset[str] = frozenset( + { + # Claude Opus 4.7 on Azure AI rejects `temperature` with + # `invalid_request_error: \`temperature\` is deprecated for this model.` + # The same model on the native Anthropic API still accepts it today, + # but stripping it everywhere is safe (Anthropic defaults to 1.0). + "claude-opus-4-7", + } +) + + +def model_supports_temperature(model: str | None) -> bool: + """Return True when the model is expected to accept ``temperature``. + + Empty / unknown model strings default to True so we don't accidentally + suppress the parameter for ordinary models. 
+ """ + if not model: + return True + lowered = model.lower() + return not any(token in lowered for token in TEMPERATURE_UNSUPPORTED_MODEL_TOKENS) diff --git a/tests/providers/test_azure_openai_provider.py b/tests/providers/test_azure_openai_provider.py index 2af6b0c..615466e 100644 --- a/tests/providers/test_azure_openai_provider.py +++ b/tests/providers/test_azure_openai_provider.py @@ -78,6 +78,38 @@ def test_supports_temperature_with_reasoning_effort(): assert AzureOpenAIProvider._supports_temperature("gpt-4o", reasoning_effort="medium") is False +def test_supports_temperature_blocks_claude_opus_4_7(): + """Bug 1 regression: Azure-hosted ``claude-opus-4-7`` deployments reject + ``temperature`` outright. The blocklist (via providers.model_compat) must + catch the deployment under every prefix variant. + """ + assert AzureOpenAIProvider._supports_temperature("claude-opus-4-7") is False + assert AzureOpenAIProvider._supports_temperature("azure/anthropic/claude-opus-4-7") is False + assert AzureOpenAIProvider._supports_temperature("anthropic/claude-opus-4-7") is False + + +def test_build_body_drops_temperature_for_claude_opus_4_7(): + """The Responses API body must NOT carry ``temperature`` when the model + is on the blocklist — otherwise Azure returns ``invalid_request_error`` + and the agent's auto-run loop wedges (see PRJ-0002 incident 2026-05-14). 
+ """ + provider = AzureOpenAIProvider( + api_key="k", + api_base="https://r.openai.azure.com", + default_model="claude-opus-4-7", + ) + body = provider._build_body( + [{"role": "user", "content": "hi"}], + None, + "azure/anthropic/claude-opus-4-7", + 4096, + 0.7, + None, + None, + ) + assert "temperature" not in body + + # --------------------------------------------------------------------------- # _build_body — Responses API body construction # --------------------------------------------------------------------------- diff --git a/tests/providers/test_model_compat.py b/tests/providers/test_model_compat.py new file mode 100644 index 0000000..37c8e9b --- /dev/null +++ b/tests/providers/test_model_compat.py @@ -0,0 +1,39 @@ +"""Tests for the shared model-compatibility helper.""" + +from __future__ import annotations + +from mira_engine.providers.model_compat import ( + TEMPERATURE_UNSUPPORTED_MODEL_TOKENS, + model_supports_temperature, +) + + +def test_ordinary_models_support_temperature(): + assert model_supports_temperature("anthropic/claude-sonnet-4-5") is True + assert model_supports_temperature("openai/gpt-4o") is True + assert model_supports_temperature("gpt-4.1-mini") is True + + +def test_empty_or_none_model_defaults_to_supported(): + # We don't want to silently strip temperature for unknown models — + # only an explicit token match disables it. + assert model_supports_temperature(None) is True + assert model_supports_temperature("") is True + + +def test_claude_opus_4_7_is_blocked_under_every_provider_prefix(): + # Bug 1: in production we observed `azure/anthropic/claude-opus-4-7` + # rejecting `temperature` with + # invalid_request_error: `temperature` is deprecated for this model. + # The token rule must catch the model under every prefix variant. 
+ assert model_supports_temperature("claude-opus-4-7") is False + assert model_supports_temperature("anthropic/claude-opus-4-7") is False + assert model_supports_temperature("azure/anthropic/claude-opus-4-7") is False + assert model_supports_temperature("openrouter/anthropic/claude-opus-4-7") is False + assert model_supports_temperature("Azure/Anthropic/Claude-Opus-4-7") is False + + +def test_blocklist_is_a_frozenset(): + # Guards against accidental in-place mutation from another module. + assert isinstance(TEMPERATURE_UNSUPPORTED_MODEL_TOKENS, frozenset) + assert "claude-opus-4-7" in TEMPERATURE_UNSUPPORTED_MODEL_TOKENS diff --git a/tests/test_model_routing.py b/tests/test_model_routing.py index 314747e..c4e6e67 100644 --- a/tests/test_model_routing.py +++ b/tests/test_model_routing.py @@ -282,6 +282,104 @@ async def test_routing_prefers_recently_successful_routing_model() -> None: assert broken.calls == 1 +class _RaisingProvider(LLMProvider): + """Provider that always raises the same exception.""" + + def __init__(self, exc: Exception): + super().__init__() + self.exc = exc + self.calls = 0 + + async def chat( + self, + messages: list[dict[str, Any]], + tools: list[dict[str, Any]] | None = None, + model: str | None = None, + max_tokens: int = 4096, + temperature: float = 0.7, + reasoning_effort: str | None = None, + ) -> LLMResponse: + self.calls += 1 + raise self.exc + + def get_default_model(self) -> str: + return "anthropic/claude-opus-4-5" + + +async def test_chat_falls_back_on_retryable_raised_exception() -> None: + """Existing behaviour preserved: a raised retryable exception should + still walk through to the next fallback candidate.""" + raising = _RaisingProvider(TimeoutError("Request timed out")) + healthy = _FakeProvider() + providers = { + "openai/gpt-4.1-mini": raising, + "openai/gpt-4.1-nano": healthy, + } + manager = RoutedProviderManager( + default_provider=_FakeProvider(), + default_model="anthropic/claude-opus-4-5", + router=None, + 
provider_factory=lambda model: providers[model], + ) + + response, resolved_route = await manager.chat( + route=RoutedModel( + tier="small", + model="openai/gpt-4.1-mini", + candidates=("openai/gpt-4.1-mini", "openai/gpt-4.1-nano"), + source="test", + ), + messages=[{"role": "user", "content": "hello"}], + ) + + assert response.content == "ok" + assert resolved_route.model == "openai/gpt-4.1-nano" + assert raising.calls == 1 + + +async def test_chat_does_not_fallback_on_non_retryable_raised_exception() -> None: + """Bug 2 regression: when a provider RAISES (rather than returns an error + response) with a permanent 4xx like ``invalid_request_error``, the manager + must NOT silently burn the remaining fallback chain — the next candidate + will fail identically. The exception is re-raised after marking the + model failed. + """ + permanent = _RaisingProvider( + RuntimeError("400 invalid_request_error: `temperature` is deprecated for this model.") + ) + other = _RaisingProvider(RuntimeError("should not be called")) + providers = { + "openai/gpt-4.1-mini": permanent, + "openai/gpt-4.1-nano": other, + } + manager = RoutedProviderManager( + default_provider=_FakeProvider(), + default_model="anthropic/claude-opus-4-5", + router=None, + provider_factory=lambda model: providers[model], + ) + + raised: Exception | None = None + try: + await manager.chat( + route=RoutedModel( + tier="small", + model="openai/gpt-4.1-mini", + candidates=("openai/gpt-4.1-mini", "openai/gpt-4.1-nano"), + source="test", + ), + messages=[{"role": "user", "content": "hello"}], + ) + except Exception as exc: + raised = exc + + assert raised is not None + assert "invalid_request_error" in str(raised) + assert permanent.calls == 1 + # Fallback candidate must NOT have been invoked. 
+ assert other.calls == 0 + + async def test_chat_reports_error_when_all_candidate_models_fail() -> None: manager = RoutedProviderManager( default_provider=_FakeProvider(), diff --git a/tests/test_research_loop_core.py b/tests/test_research_loop_core.py index 535fa32..fd57a9c 100644 --- a/tests/test_research_loop_core.py +++ b/tests/test_research_loop_core.py @@ -245,6 +245,57 @@ def test_auto_run_decision_helpers(tmp_path: Path) -> None: auto_round=0, ) is False + # Bug 3 regression: when the LLM call itself fails (parameter rejected, + # gateway down, all candidates exhausted) the final_content is a system + # error and auto-run must halt unconditionally — even when there is + # pending work in the plan and strictHeuristics is the default. Without + # this guard, a single bad request burns all 20 auto rounds repeatedly + # hitting the same error (see PRJ-0002 incident 2026-05-14). + assert ResearchAgentLoop._looks_like_llm_provider_error( + "Error calling LLM: litellm.BadRequestError: Azure_aiException - " + "{\"type\":\"error\",\"error\":{\"type\":\"invalid_request_error\"," + "\"message\":\"`temperature` is deprecated for this model.\"}}" + ) is True + assert ResearchAgentLoop._looks_like_llm_provider_error( + "Error calling Azure OpenAI: Connection failed" + ) is True + assert ResearchAgentLoop._looks_like_llm_provider_error( + "All candidate models failed for this turn. Last error from " + "'azure/anthropic/claude-opus-4-7': Error calling LLM: Request timed out." + ) is True + # Ordinary experiment outputs that happen to contain the word "error" + # must NOT trigger this guard. 
+ assert ResearchAgentLoop._looks_like_llm_provider_error( + "实验完成,error rate 降至 0.03。继续下一组消融。" + ) is False + assert ResearchAgentLoop._looks_like_llm_provider_error(None) is False + + decision, reason = loop._evaluate_continuation( + run_mode="auto", + project_dir=str(project), + final_content=( + "Error: {'message': 'litellm.BadRequestError: Azure_aiException - " + "{\"type\":\"error\",\"error\":{\"type\":\"invalid_request_error\"," + "\"message\":\"`temperature` is deprecated for this model.\"}}'}" + ), + auto_round=1, + ) + assert decision is False + assert reason == "llm provider error" + + # Even with strictHeuristics disabled (the production config from PRJ-0002 + # had no automation_policy, which defaults to relaxed heuristics) the + # LLM provider error must still halt. + decision, reason = loop._evaluate_continuation( + run_mode="auto", + project_dir=str(project), + final_content="All candidate models failed for this turn. Last error from 'gpt-5.5': Error calling LLM: Request timed out.", + auto_round=1, + automation_policy=relaxed_policy, + ) + assert decision is False + assert reason == "llm provider error" + exhausted_policy = loop._parse_automation_policy( { "logic": "AND", From 2dd914085bc691a0e764180d50cddce84cb3c389 Mon Sep 17 00:00:00 2001 From: Chenglong Wang Date: Thu, 14 May 2026 21:33:16 +0800 Subject: [PATCH 2/2] fix(providers): drop temperature for claude-opus-4-7 on OpenAICompatProvider too Second occurrence at 2026-05-14 21:09: the temperature error reappeared even after the first PR fix. Root cause: `OpenAICompatProvider` (used by `custom` provider configs and by `GitHubCopilotProvider` via inheritance) keeps its own `_supports_temperature` rule that only blocked GPT-5 / o1 / o3 / o4 deployments. 
When a user's OpenAI-compatible endpoint proxies to Azure-hosted `claude-opus-4-7`, this path still attached `temperature` and Azure 400'd with `invalid_request_error: \`temperature\` is deprecated for this model.` Have `_supports_temperature` also consult the shared `providers.model_compat` registry. This follows the same pattern as the Azure / LiteLLM / Anthropic providers in the parent commit. The error-format trail (`Error: {'message':...}`) comes from `_handle_error` in `openai_compat_provider.py:811`, which confirms this code path is the one the user's config hits. Adds three regression tests, covering: - `_supports_temperature` returns False for `claude-opus-4-7` under every provider prefix. - `_build_kwargs` AND `_build_responses_body` both omit `temperature` from the outbound request body for `azure/anthropic/claude-opus-4-7` (one test each). --- mira_engine/agent/research_loop.py | 2 +- .../providers/openai_compat_provider.py | 11 +++- tests/providers/test_azure_openai_provider.py | 2 +- tests/providers/test_litellm_kwargs.py | 53 +++++++++++++++++++ tests/test_research_loop_core.py | 7 ++- 5 files changed, 67 insertions(+), 8 deletions(-) diff --git a/mira_engine/agent/research_loop.py b/mira_engine/agent/research_loop.py index f3a3705..28cbd20 100644 --- a/mira_engine/agent/research_loop.py +++ b/mira_engine/agent/research_loop.py @@ -331,7 +331,7 @@ def _looks_like_llm_provider_error(cls, text: str | None) -> bool: halt auto mode — they are NOT experiment outcomes. Without this guard a parameter-level 4xx (e.g. Azure dropping ``temperature``) gets retried every round until ``_AUTO_MAX_ROUNDS`` (20) is - exhausted, which is what we saw in PRJ-0002 on 2026-05-14. + exhausted. 
Unlike :meth:`_looks_like_failure_response`, this check fires regardless of ``strictHeuristics`` because the agent has not even diff --git a/mira_engine/providers/openai_compat_provider.py b/mira_engine/providers/openai_compat_provider.py index 4b4a051..54a7026 100644 --- a/mira_engine/providers/openai_compat_provider.py +++ b/mira_engine/providers/openai_compat_provider.py @@ -26,6 +26,7 @@ from openai import AsyncOpenAI from mira_engine.providers.base import LLMProvider, LLMResponse, ToolCallRequest +from mira_engine.providers.model_compat import model_supports_temperature from mira_engine.providers.openai_responses import ( consume_sdk_stream, convert_messages, @@ -263,11 +264,17 @@ def _supports_temperature( ) -> bool: """Return True when the model accepts a temperature parameter. - GPT-5 family and reasoning models (o1/o3/o4) reject temperature - when reasoning_effort is set to anything other than ``"none"``. + Combines two rule sets: + - GPT-5 / o1 / o3 / o4 deployments (and any non-``"none"`` + ``reasoning_effort``) reject temperature. + - Models flagged in :mod:`providers.model_compat` (e.g. + Azure-proxied ``claude-opus-4-7``) reject temperature + regardless of the OpenAI-compatible front-end. 
""" if reasoning_effort and reasoning_effort.lower() != "none": return False + if not model_supports_temperature(model_name): + return False name = model_name.lower() return not any(token in name for token in ("gpt-5", "o1", "o3", "o4")) diff --git a/tests/providers/test_azure_openai_provider.py b/tests/providers/test_azure_openai_provider.py index 615466e..a0fe144 100644 --- a/tests/providers/test_azure_openai_provider.py +++ b/tests/providers/test_azure_openai_provider.py @@ -91,7 +91,7 @@ def test_supports_temperature_blocks_claude_opus_4_7(): def test_build_body_drops_temperature_for_claude_opus_4_7(): """The Responses API body must NOT carry ``temperature`` when the model is on the blocklist — otherwise Azure returns ``invalid_request_error`` - and the agent's auto-run loop wedges (see PRJ-0002 incident 2026-05-14). + and the agent's auto-run loop wedges. """ provider = AzureOpenAIProvider( api_key="k", diff --git a/tests/providers/test_litellm_kwargs.py b/tests/providers/test_litellm_kwargs.py index 2df8dc5..49041a5 100644 --- a/tests/providers/test_litellm_kwargs.py +++ b/tests/providers/test_litellm_kwargs.py @@ -502,6 +502,59 @@ def test_openai_compat_supports_temperature_matches_reasoning_model_rules() -> N assert OpenAICompatProvider._supports_temperature("gpt-4o", reasoning_effort="medium") is False +def test_openai_compat_supports_temperature_blocks_claude_opus_4_7() -> None: + """Bug 1 regression: OpenAI-compatible custom endpoints that proxy to + Azure-hosted ``claude-opus-4-7`` must drop ``temperature`` — otherwise + Azure returns ``invalid_request_error: \\`temperature\\` is deprecated for + this model.``. 
+ """ + assert OpenAICompatProvider._supports_temperature("claude-opus-4-7") is False + assert OpenAICompatProvider._supports_temperature("azure/anthropic/claude-opus-4-7") is False + assert OpenAICompatProvider._supports_temperature("anthropic/claude-opus-4-7") is False + + +def test_openai_compat_build_kwargs_drops_temperature_for_claude_opus_4_7() -> None: + spec = find_by_name("custom") + with patch("mira_engine.providers.openai_compat_provider.AsyncOpenAI"): + provider = OpenAICompatProvider( + api_key="any", + default_model="azure/anthropic/claude-opus-4-7", + spec=spec, + ) + + kwargs = provider._build_kwargs( + messages=[{"role": "user", "content": "hi"}], + tools=None, + model="azure/anthropic/claude-opus-4-7", + max_tokens=512, + temperature=0.7, + reasoning_effort=None, + tool_choice=None, + ) + assert "temperature" not in kwargs + + +def test_openai_compat_build_responses_body_drops_temperature_for_claude_opus_4_7() -> None: + spec = find_by_name("custom") + with patch("mira_engine.providers.openai_compat_provider.AsyncOpenAI"): + provider = OpenAICompatProvider( + api_key="any", + default_model="azure/anthropic/claude-opus-4-7", + spec=spec, + ) + + body = provider._build_responses_body( + messages=[{"role": "user", "content": "hi"}], + tools=None, + model="azure/anthropic/claude-opus-4-7", + max_tokens=512, + temperature=0.7, + reasoning_effort=None, + tool_choice=None, + ) + assert "temperature" not in body + + def test_openai_compat_build_kwargs_uses_gpt5_safe_parameters() -> None: spec = find_by_name("openai") with patch("mira_engine.providers.openai_compat_provider.AsyncOpenAI"): diff --git a/tests/test_research_loop_core.py b/tests/test_research_loop_core.py index fd57a9c..4b5b7a5 100644 --- a/tests/test_research_loop_core.py +++ b/tests/test_research_loop_core.py @@ -245,12 +245,12 @@ def test_auto_run_decision_helpers(tmp_path: Path) -> None: auto_round=0, ) is False - # Bug 3 regression: when the LLM call itself fails (parameter rejected, + # 
When the LLM call itself fails (parameter rejected, # gateway down, all candidates exhausted), the final_content is a system # error and auto-run must halt unconditionally — even when there is # pending work in the plan and strictHeuristics is the default. Without # this guard, a single bad request burns all 20 auto rounds repeatedly - # hitting the same error (see PRJ-0002 incident 2026-05-14). + # hitting the same error. assert ResearchAgentLoop._looks_like_llm_provider_error( "Error calling LLM: litellm.BadRequestError: Azure_aiException - " "{\"type\":\"error\",\"error\":{\"type\":\"invalid_request_error\"," @@ -283,8 +283,7 @@ def test_auto_run_decision_helpers(tmp_path: Path) -> None: assert decision is False assert reason == "llm provider error" - # Even with strictHeuristics disabled (the production config from PRJ-0002 - # had no automation_policy, which defaults to relaxed heuristics) the + # Even with strictHeuristics disabled the # LLM provider error must still halt. decision, reason = loop._evaluate_continuation( run_mode="auto",