Tracer-Cloud · devankitjuneja · May 17, 2026 · May 18, 2026 · May 18, 2026 · May 18, 2026
diff --git a/app/agent/investigation.py b/app/agent/investigation.py
@@ -11,7 +11,7 @@
 
 from app.agent.llm_invoke_errors import LLMInvokeFailure, classify_llm_invoke_failure
 from app.agent.prompt import build_system_prompt, format_alert_context
-from app.agent.result import InvestigationResult, parse_diagnosis
+from app.agent.result import InvestigationResult, check_sufficiency, parse_diagnosis
 from app.cli.support.output import debug_print, get_tracker
 from app.constants.investigation import MAX_INVESTIGATION_LOOPS
 from app.services.agent_llm_client import ToolCall, get_agent_llm
@@ -237,12 +237,23 @@ def _record_tool_end(tc: ToolCall, output: Any) -> None:
         result.evidence_entries = [e.model_dump() for e in evidence_entries]
         result.agent_messages = messages
 
+        if check_sufficiency(result):
+            if result.root_cause.startswith("Most likely:"):
+                result.root_cause = result.root_cause[len("Most likely:") :].lstrip()
+        else:
+            if not result.root_cause.startswith("Most likely"):
+                result.root_cause = f"Most likely: {result.root_cause}"
+            # Override band set by classify_confidence_band in result.py: thin evidence means LOW
+            # regardless of LLM-reported score (a high score with zero claims is still insufficient).
+            result.confidence_band = "low"
+
         _emit(
             "agent_end",
             {
                 "root_cause": result.root_cause,
                 "validity_score": result.validity_score,
                 "root_cause_category": result.root_cause_category,
+                "confidence_band": result.confidence_band,
             },
         )
 
@@ -282,6 +293,9 @@ def _degraded_investigation_from_llm_failure(
             "root_cause": error_msg,
             "validity_score": 0.0,
             "root_cause_category": failure.root_cause_category,
+            "confidence_band": "low",
+            "ranked_hypotheses": [],
+            "missing_evidence": [],
         },
     )
     updates = {
@@ -292,6 +306,9 @@ def _degraded_investigation_from_llm_failure(
         "non_validated_claims": [],
         "remediation_steps": failure.remediation_steps,
         "validity_score": 0.0,
+        "confidence_band": "low",
+        "ranked_hypotheses": [],
+        "missing_evidence": [],
         "investigation_recommendations": [],
         "evidence": evidence,
         "evidence_entries": [e.model_dump() for e in evidence_entries],
@@ -647,6 +664,9 @@ def _result_to_state(result: InvestigationResult) -> dict[str, Any]:
         "non_validated_claims": result.non_validated_claims,
         "remediation_steps": result.remediation_steps,
         "validity_score": result.validity_score,
+        "confidence_band": result.confidence_band,
+        "ranked_hypotheses": result.ranked_hypotheses,
+        "missing_evidence": result.missing_evidence,
         "investigation_recommendations": result.investigation_recommendations,
         "evidence": result.evidence,
         "evidence_entries": result.evidence_entries,

diff --git a/app/agent/prompt.py b/app/agent/prompt.py
@@ -36,6 +36,10 @@
 - **Non-validated claims**: Hypotheses you could not confirm
 - **Remediation steps**: Ordered, concrete actions to fix the issue
 - **Validity score**: 0.0–1.0 reflecting your confidence based on evidence quality
+- **Confidence band**: `high` (strong evidence from multiple sources), `medium` (partial evidence, some gaps), or `low` (thin or conflicting evidence)
+- **Ranked hypotheses**: If confidence is medium or low, list alternative explanations in order of likelihood (most likely first)
+- **Missing evidence**: List specific data sources or queries that would confirm or refute the diagnosis but were unavailable
+- If evidence is thin or conflicting, begin your root cause statement with "Most likely: " to signal uncertainty
 """
 
 _ALERT_CONTEXT_TEMPLATE = """## Alert

diff --git a/app/agent/result.py b/app/agent/result.py
@@ -4,7 +4,7 @@
 
 import logging
 from dataclasses import dataclass, field
-from typing import Any, TypedDict, cast
+from typing import Any, Literal, TypedDict, cast
 
 from pydantic import BaseModel, Field
 
@@ -17,6 +17,17 @@
 logger = logging.getLogger(__name__)
 
 
+class _ValidatedClaimSchema(BaseModel):
+    """Schema entry for one validated claim and the evidence keys that support it."""
+
+    claim: str = Field(description="The validated claim statement")
+    evidence_sources: list[str] = Field(
+        default_factory=list,
+        description="Subset of the collected evidence keys that directly support this specific "
+        "claim. Only include keys that actually informed this claim.",
+    )
+
+
 @dataclass
 class InvestigationResult:
     root_cause: str
@@ -26,6 +37,9 @@ class InvestigationResult:
     non_validated_claims: list[dict] = field(default_factory=list)
     remediation_steps: list[str] = field(default_factory=list)
     validity_score: float = 0.0
+    confidence_band: str = ""
+    ranked_hypotheses: list[str] = field(default_factory=list)
+    missing_evidence: list[str] = field(default_factory=list)
     evidence: dict[str, Any] = field(default_factory=dict)
     evidence_entries: list[dict] = field(default_factory=list)
     agent_messages: list[dict] = field(default_factory=list)
@@ -37,6 +51,7 @@ def unknown(cls, alert_name: str = "Unknown alert") -> InvestigationResult:
             root_cause=f"{alert_name}: Unable to determine root cause — insufficient evidence.",
             root_cause_category="unknown",
             validity_score=0.0,
+            confidence_band="low",
             non_validated_claims=[
                 {
                     "claim": "Insufficient evidence available",
@@ -51,9 +66,26 @@ def noise(cls) -> InvestigationResult:
             root_cause="Message classified as noise — no investigation needed.",
             root_cause_category="healthy",
             validity_score=1.0,
+            confidence_band="high",
         )
 
 
+def classify_confidence_band(score: float) -> Literal["high", "medium", "low"]:
+    """Bucket a validity score: >= 0.75 is high, >= 0.40 is medium, otherwise low."""
+    if score >= 0.75:
+        return "high"
+    # A score that fails both comparisons (including NaN) ends up in the "low" band.
+    return "medium" if score >= 0.40 else "low"
+
+
+def check_sufficiency(result: InvestigationResult) -> bool:
+    """Return True when evidence is sufficient to state the root cause without hedging."""
+    if result.root_cause_category in {"healthy", "unknown"}:
+        return True
+    needed = 1 if result.validity_score >= 0.75 else 2  # claims required at this score
+    return result.validity_score >= 0.40 and len(result.validated_claims) >= needed
+
+
 def parse_diagnosis(
     messages: list[dict[str, Any]],
     evidence: dict[str, Any],
@@ -121,8 +153,9 @@ class DiagnosisSchema(BaseModel):
         causal_chain: list[str] = Field(
             default_factory=list, description="Ordered steps leading to the failure"
         )
-        validated_claims: list[str] = Field(
-            default_factory=list, description="Claims supported by tool evidence"
+        validated_claims: list[_ValidatedClaimSchema] = Field(
+            default_factory=list,
+            description="Claims supported by tool evidence, each with their specific supporting evidence keys",
         )
         non_validated_claims: list[str] = Field(
             default_factory=list, description="Claims not yet confirmed by evidence"
@@ -133,6 +166,14 @@ class DiagnosisSchema(BaseModel):
         validity_score: float = Field(
             default=0.0, description="0.0–1.0 confidence in the diagnosis"
         )
+        ranked_hypotheses: list[str] = Field(
+            default_factory=list,
+            description="Alternative hypotheses ranked by likelihood (most to least likely)",
+        )
+        missing_evidence: list[str] = Field(
+            default_factory=list,
+            description="Evidence that would confirm or refute the diagnosis but was unavailable",
+        )
 
     return DiagnosisSchema
 
@@ -157,10 +198,12 @@ class _DiagnosisPayload(TypedDict):
         root_cause: str
         root_cause_category: str
         causal_chain: list[str]
-        validated_claims: list[str]
+        validated_claims: list[dict]
         non_validated_claims: list[str]
         remediation_steps: list[str]
         validity_score: float
+        ranked_hypotheses: list[str]
+        missing_evidence: list[str]
 
     llm = get_llm_for_reasoning()
     schema_model = _build_diagnosis_schema(_taxonomy_categories_for_alert_source(alert_source))
@@ -181,10 +224,23 @@ def _to_claim_dicts(claims: list[str], status: str) -> list[dict]:
         root_cause=schema["root_cause"],
         root_cause_category=schema["root_cause_category"],
         causal_chain=schema["causal_chain"],
-        validated_claims=_to_claim_dicts(schema["validated_claims"], "validated"),
+        validated_claims=[
+            {
+                "claim": c["claim"],
+                "validation_status": "validated",
+                **(
+                    {"evidence_sources": c["evidence_sources"]} if c.get("evidence_sources") else {}
+                ),
+            }
+            for c in schema["validated_claims"]
+            if c.get("claim")
+        ],
         non_validated_claims=_to_claim_dicts(schema["non_validated_claims"], "not_validated"),
         remediation_steps=schema["remediation_steps"],
         validity_score=schema["validity_score"],
+        confidence_band=classify_confidence_band(schema["validity_score"]),
+        ranked_hypotheses=schema["ranked_hypotheses"],
+        missing_evidence=schema["missing_evidence"],
     )
 
 
@@ -207,6 +263,7 @@ def _parse_via_legacy(
             ],
             remediation_steps=rr.remediation_steps,
             validity_score=0.5,
+            confidence_band=classify_confidence_band(0.5),
         )
     except Exception as err:
         logger.warning("Legacy parse_root_cause also failed: %s", err)

diff --git a/app/cli/investigation/investigate.py b/app/cli/investigation/investigate.py
@@ -140,6 +140,7 @@ def run_investigation_cli(
         "root_cause": state["root_cause"],
         "is_noise": state.get("is_noise", False),
         "validity_score": state.get("validity_score", 0.0),
+        "confidence_band": state.get("confidence_band", ""),
     }
     if state.get("evidence_entries"):
         out["tool_calls"] = state["evidence_entries"]

diff --git a/app/delivery/publish_findings/formatters/report.py b/app/delivery/publish_findings/formatters/report.py
@@ -466,6 +466,12 @@ def format_slack_message(ctx: ReportContext) -> str:
     if top_log:
         conclusion_block += f"`{top_log}`\n"
 
+    confidence_band = ctx.get("confidence_band", "")
+    validity_score_val = ctx.get("validity_score")
+    if confidence_band:
+        pct = f" ({validity_score_val:.0%})" if isinstance(validity_score_val, (int, float)) else ""
+        conclusion_block += f"*Confidence:* {confidence_band.upper()}{pct}\n"
+
     validated_lines, non_validated_lines = _render_claim_lines(ctx)
     if validated_lines:
         # Use a larger markdown heading so that "Findings" stands out as a section.
@@ -475,6 +481,21 @@ def format_slack_message(ctx: ReportContext) -> str:
             "\n*Non-Validated Claims (Inferred):*\n" + "\n".join(non_validated_lines) + "\n"
         )
 
+    ranked_hypotheses = ctx.get("ranked_hypotheses") or []
+    if ranked_hypotheses:
+        conclusion_block += (
+            "\n*Alternative hypotheses:*\n"
+            + "\n".join(f"• {_sanitize_for_slack(h)}" for h in ranked_hypotheses)
+            + "\n"
+        )
+    missing_evidence_list = ctx.get("missing_evidence") or []
+    if missing_evidence_list:
+        conclusion_block += (
+            "\n*Missing evidence:*\n"
+            + "\n".join(f"• {_sanitize_for_slack(e)}" for e in missing_evidence_list)
+            + "\n"
+        )
+
     correlation_signal_lines, correlation_driver_lines = _format_correlation_lines(ctx)
     if correlation_signal_lines or correlation_driver_lines:
         conclusion_block += "\n## Upstream Correlation\n"
@@ -556,12 +577,33 @@ def format_telegram_message(ctx: ReportContext) -> str:
             rc += "\n<code>" + html.escape(top_log) + "</code>"
         parts.append(rc)
 
+    confidence_band = ctx.get("confidence_band", "")
+    validity_score_val = ctx.get("validity_score")
+    if confidence_band:
+        pct = f" ({validity_score_val:.0%})" if isinstance(validity_score_val, (int, float)) else ""
+        parts.append(f"<b>Confidence:</b> {html.escape(confidence_band.upper())}{pct}")
+
     validated_lines, non_validated_lines = _render_claim_lines_telegram(ctx)
     if validated_lines:
         parts.append("<b>Findings</b>\n" + "\n".join(validated_lines))
     if non_validated_lines:
         parts.append("<b>Non-Validated Claims (Inferred)</b>\n" + "\n".join(non_validated_lines))
 
+    ranked_hypotheses = ctx.get("ranked_hypotheses") or []
+    if ranked_hypotheses:
+        hyp = "\n".join(
+            "• " + _to_telegram_html_body(_sanitize_for_slack(str(h))) for h in ranked_hypotheses
+        )
+        parts.append("<b>Alternative hypotheses</b>\n" + hyp)
+
+    missing_evidence_list = ctx.get("missing_evidence") or []
+    if missing_evidence_list:
+        me = "\n".join(
+            "• " + _to_telegram_html_body(_sanitize_for_slack(str(e)))
+            for e in missing_evidence_list
+        )
+        parts.append("<b>Missing evidence</b>\n" + me)
+
     provenance_lines = _format_provenance_lines(ctx)
     if provenance_lines:
         prov = "\n".join(
@@ -695,6 +737,13 @@ def _add(block: "dict[str, Any] | None") -> None:
         rc_text += f"\n`{top_log}`"
     _add(_mrkdwn_section(rc_text))
 
+    # ── Confidence band ──
+    confidence_band = ctx.get("confidence_band", "")
+    validity_score_val = ctx.get("validity_score")
+    if confidence_band:
+        pct = f" ({validity_score_val:.0%})" if isinstance(validity_score_val, (int, float)) else ""
+        _add(_mrkdwn_section(f"*Confidence:* {confidence_band.upper()}{pct}"))
+
     # ── Failed Pods ──
     datadog_site = ctx.get("datadog_site", "datadoghq.com")
     all_pods = get_failed_pods(ctx)
@@ -727,6 +776,28 @@ def _add(block: "dict[str, Any] | None") -> None:
     if non_validated_lines:
         _add(_mrkdwn_section("*Inferred (not yet validated)*\n" + "\n".join(non_validated_lines)))
 
+    # ── Alternative Hypotheses ──
+    ranked_hypotheses = ctx.get("ranked_hypotheses") or []
+    if ranked_hypotheses:
+        blocks.append({"type": "divider"})
+        _add(
+            _mrkdwn_section(
+                "*Alternative hypotheses:*\n"
+                + "\n".join(f"• {_sanitize_for_slack(h)}" for h in ranked_hypotheses)
+            )
+        )
+
+    # ── Missing Evidence ──
+    missing_evidence_list = ctx.get("missing_evidence") or []
+    if missing_evidence_list:
+        blocks.append({"type": "divider"})
+        _add(
+            _mrkdwn_section(
+                "*Missing evidence:*\n"
+                + "\n".join(f"• {_sanitize_for_slack(e)}" for e in missing_evidence_list)
+            )
+        )
+
     correlation_signal_lines, correlation_driver_lines = _format_correlation_lines(ctx)
     if correlation_signal_lines or correlation_driver_lines:
         blocks.append({"type": "divider"})

diff --git a/app/delivery/publish_findings/node.py b/app/delivery/publish_findings/node.py
@@ -48,7 +48,14 @@ def generate_report(state: InvestigationState) -> dict:
 
     all_blocks = build_slack_blocks(ctx) + build_action_blocks(investigation_url, investigation_id)
     all_blocks = masking_ctx.unmask_value(all_blocks)
-    render_report(slack_message, root_cause_category=state.get("root_cause_category"))
+    render_report(
+        slack_message,
+        root_cause_category=state.get("root_cause_category"),
+        confidence_band=state.get("confidence_band", ""),
+        validity_score=state.get("validity_score"),
+        ranked_hypotheses=state.get("ranked_hypotheses", []),
+        missing_evidence=state.get("missing_evidence", []),
+    )
     open_in_editor(slack_message)
 
     slack_ctx = state.get("slack_context", {})