From d0bf9cd64052e2edee6a584ddcef07dca514a657 Mon Sep 17 00:00:00 2001
From: Devon Kelley <devon@kalibr.systems>
Date: Fri, 22 May 2026 12:27:11 -0700
Subject: [PATCH] fix: return best attempt on heal loop exhaustion (v1.14.4)

When all heal-loop paths are exhausted, return the last attempted response
instead of raising RuntimeError. Set kalibr_heal_exhausted=True on the
response (along with kalibr_healed, kalibr_heal_count, kalibr_models_tried,
kalibr_model_used and kalibr_trace_id) so callers can detect the exhausted
state. Still raise RuntimeError if no response was ever received (e.g.
network failure before any bytes arrived).

Prevents benchmarks from counting partial/low-quality results as hard
failures with model='error'.

Closes: heal-loop-exhaustion-raises
---
 CHANGELOG.md     |  6 ++++++
 kalibr/router.py | 17 ++++++++++++++++-
 pyproject.toml   |  2 +-
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ec0b659..4785166 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [1.14.4] - 2026-05-22
+
+### Fixed
+
+- **Heal loop returns best attempt instead of raising on exhaustion** — When all heal-loop paths are exhausted, the router now returns the last attempted response (with `kalibr_heal_exhausted = True`) instead of raising `RuntimeError`. It still raises `RuntimeError` if no response was ever received (e.g. network failure before any bytes arrived). This prevents benchmarks and callers from counting partial results as hard errors; callers that relied on catching `RuntimeError` to detect exhaustion must now check `kalibr_heal_exhausted` on the returned response.
+
 ### Added
 
 - **Tavily Search provider** — `tavily/basic` and `tavily/advanced` as Router paths. Returns web search results wrapped in an OpenAI-compatible ChatCompletion shim so Thompson Sampling can compete Tavily against LLMs on web research goals. Set `TAVILY_API_KEY` env var.
diff --git a/kalibr/router.py b/kalibr/router.py
index 77d53dc..fb5b77a 100644
--- a/kalibr/router.py
+++ b/kalibr/router.py
@@ -925,7 +925,22 @@ def _heal_dispatch(m_id: str, msgs: List[Dict], system_prompt: Optional[str] = N
                 router_span.set_attribute("kalibr.failure_category", failure_category)
 
                 err_msg = heal_result.get("error") or f"heal loop failed: {failure_category}"
-                raise RuntimeError(f"Heal loop exhausted all paths: {err_msg}")
+
+                # Return the best attempt instead of raising — a partial response is
+                # better than an exception for benchmarks and callers that can tolerate
+                # lower-quality output. Raise only if no response was ever received
+                # (e.g. network failure before any bytes arrived).
+                best_response = heal_result.get("response")
+                if best_response is None:
+                    raise RuntimeError(f"Heal loop exhausted all paths: {err_msg}")
+
+                best_response.kalibr_trace_id = trace_id
+                best_response.kalibr_healed = True
+                best_response.kalibr_heal_exhausted = True
+                best_response.kalibr_heal_count = heal_result.get("heal_count", 0)
+                best_response.kalibr_models_tried = heal_result.get("models_tried") or []
+                best_response.kalibr_model_used = used_model
+                return best_response
 
             # Step 5: Build ordered candidate paths for fallback
             # First: intelligence-selected path, then remaining registered paths
diff --git a/pyproject.toml b/pyproject.toml
index 85f43f1..4e42ece 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "kalibr"
-version = "1.14.3"
+version = "1.14.4"
 description = "Outcome-aware LLM routing for production AI agents. Routes between models, tools, and parameters based on real success signals using Thompson Sampling. Automatic fallback, cost optimization, and continuous learning — no redeploy required."
 authors = [{name = "Kalibr Team", email = "support@kalibr.systems"}]
 readme = "README.md"