Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion app/agent/investigation.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from app.agent.llm_invoke_errors import LLMInvokeFailure, classify_llm_invoke_failure
from app.agent.prompt import build_system_prompt, format_alert_context
from app.agent.result import InvestigationResult, parse_diagnosis
from app.agent.result import InvestigationResult, check_sufficiency, parse_diagnosis
from app.cli.support.output import debug_print, get_tracker
from app.constants.investigation import MAX_INVESTIGATION_LOOPS
from app.services.agent_llm_client import ToolCall, get_agent_llm
Expand Down Expand Up @@ -237,12 +237,23 @@ def _record_tool_end(tc: ToolCall, output: Any) -> None:
result.evidence_entries = [e.model_dump() for e in evidence_entries]
result.agent_messages = messages

if check_sufficiency(result):
if result.root_cause.startswith("Most likely:"):
result.root_cause = result.root_cause[len("Most likely:") :].lstrip()
else:
if not result.root_cause.startswith("Most likely"):
result.root_cause = f"Most likely: {result.root_cause}"
# Override band set by classify_confidence_band in result.py: thin evidence means LOW
# regardless of LLM-reported score (a high score with zero claims is still insufficient).
result.confidence_band = "low"

_emit(
"agent_end",
{
"root_cause": result.root_cause,
"validity_score": result.validity_score,
"root_cause_category": result.root_cause_category,
"confidence_band": result.confidence_band,
},
)

Expand Down Expand Up @@ -282,6 +293,9 @@ def _degraded_investigation_from_llm_failure(
"root_cause": error_msg,
"validity_score": 0.0,
"root_cause_category": failure.root_cause_category,
"confidence_band": "low",
"ranked_hypotheses": [],
"missing_evidence": [],
},
)
updates = {
Expand All @@ -292,6 +306,9 @@ def _degraded_investigation_from_llm_failure(
"non_validated_claims": [],
"remediation_steps": failure.remediation_steps,
"validity_score": 0.0,
"confidence_band": "low",
"ranked_hypotheses": [],
"missing_evidence": [],
"investigation_recommendations": [],
"evidence": evidence,
"evidence_entries": [e.model_dump() for e in evidence_entries],
Expand Down Expand Up @@ -647,6 +664,9 @@ def _result_to_state(result: InvestigationResult) -> dict[str, Any]:
"non_validated_claims": result.non_validated_claims,
"remediation_steps": result.remediation_steps,
"validity_score": result.validity_score,
"confidence_band": result.confidence_band,
"ranked_hypotheses": result.ranked_hypotheses,
"missing_evidence": result.missing_evidence,
"investigation_recommendations": result.investigation_recommendations,
"evidence": result.evidence,
"evidence_entries": result.evidence_entries,
Expand Down
4 changes: 4 additions & 0 deletions app/agent/prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@
- **Non-validated claims**: Hypotheses you could not confirm
- **Remediation steps**: Ordered, concrete actions to fix the issue
- **Validity score**: 0.0–1.0 reflecting your confidence based on evidence quality
- **Confidence band**: `high` (strong evidence from multiple sources), `medium` (partial evidence, some gaps), or `low` (thin or conflicting evidence)
- **Ranked hypotheses**: If confidence is medium or low, list alternative explanations in order of likelihood (most likely first)
- **Missing evidence**: List specific data sources or queries that would confirm or refute the diagnosis but were unavailable
- If evidence is thin or conflicting, begin your root cause statement with "Most likely: " to signal uncertainty
"""

_ALERT_CONTEXT_TEMPLATE = """## Alert
Expand Down
67 changes: 62 additions & 5 deletions app/agent/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import logging
from dataclasses import dataclass, field
from typing import Any, TypedDict, cast
from typing import Any, Literal, TypedDict, cast

from pydantic import BaseModel, Field

Expand All @@ -17,6 +17,17 @@
logger = logging.getLogger(__name__)


# Schema for one validated claim: the claim text together with the evidence
# keys that back it. Documented with comments (not a class docstring) so no
# additional runtime string is attached to the model.
class _ValidatedClaimSchema(BaseModel):
    # Claim statement; declared without a default value.
    claim: str = Field(description="The validated claim statement")
    # Evidence keys supporting this claim; a new empty list is produced per
    # instance via default_factory when the field is omitted.
    evidence_sources: list[str] = Field(
        description=(
            "Subset of the collected evidence keys that directly support this specific claim. "
            "Only include keys that actually informed this claim."
        ),
        default_factory=list,
    )


@dataclass
class InvestigationResult:
root_cause: str
Expand All @@ -26,6 +37,9 @@ class InvestigationResult:
non_validated_claims: list[dict] = field(default_factory=list)
remediation_steps: list[str] = field(default_factory=list)
validity_score: float = 0.0
confidence_band: str = ""
ranked_hypotheses: list[str] = field(default_factory=list)
missing_evidence: list[str] = field(default_factory=list)
evidence: dict[str, Any] = field(default_factory=dict)
evidence_entries: list[dict] = field(default_factory=list)
agent_messages: list[dict] = field(default_factory=list)
Expand All @@ -37,6 +51,7 @@ def unknown(cls, alert_name: str = "Unknown alert") -> InvestigationResult:
root_cause=f"{alert_name}: Unable to determine root cause — insufficient evidence.",
root_cause_category="unknown",
validity_score=0.0,
confidence_band="low",
non_validated_claims=[
{
"claim": "Insufficient evidence available",
Expand All @@ -51,9 +66,26 @@ def noise(cls) -> InvestigationResult:
root_cause="Message classified as noise — no investigation needed.",
root_cause_category="healthy",
validity_score=1.0,
confidence_band="high",
)


def classify_confidence_band(score: float) -> Literal["high", "medium", "low"]:
    """Translate a numeric validity score into a coarse confidence band.

    Args:
        score: Validity score to classify.

    Returns:
        ``"high"`` when ``score >= 0.75``, ``"medium"`` when
        ``0.40 <= score < 0.75``, and ``"low"`` for everything else
        (including values that fail both comparisons, such as NaN).
    """
    # Phrased as "not >=" rather than "<" so that a NaN score, for which every
    # comparison is False, still lands in the "low" band.
    if not score >= 0.40:
        return "low"
    return "high" if score >= 0.75 else "medium"


def check_sufficiency(result: InvestigationResult) -> bool:
    """Decide whether a diagnosis is backed by enough evidence.

    Args:
        result: Investigation result whose ``root_cause_category``,
            ``validity_score`` and ``validated_claims`` are inspected.

    Returns:
        ``True`` when the category is ``"healthy"`` or ``"unknown"``, when the
        score is at least 0.75 with one or more validated claims, or when the
        score is at least 0.40 with two or more validated claims; ``False``
        otherwise.
    """
    # These categories are accepted without looking at score or claims.
    if result.root_cause_category in {"healthy", "unknown"}:
        return True

    # The higher the score, the fewer validated claims are demanded.
    score = result.validity_score
    if score >= 0.75:
        claims_needed = 1
    elif score >= 0.40:
        claims_needed = 2
    else:
        return False
    return len(result.validated_claims) >= claims_needed
Comment thread
greptile-apps[bot] marked this conversation as resolved.


def parse_diagnosis(
messages: list[dict[str, Any]],
evidence: dict[str, Any],
Expand Down Expand Up @@ -121,8 +153,9 @@ class DiagnosisSchema(BaseModel):
causal_chain: list[str] = Field(
default_factory=list, description="Ordered steps leading to the failure"
)
validated_claims: list[str] = Field(
default_factory=list, description="Claims supported by tool evidence"
validated_claims: list[_ValidatedClaimSchema] = Field(
default_factory=list,
description="Claims supported by tool evidence, each with their specific supporting evidence keys",
)
non_validated_claims: list[str] = Field(
default_factory=list, description="Claims not yet confirmed by evidence"
Expand All @@ -133,6 +166,14 @@ class DiagnosisSchema(BaseModel):
validity_score: float = Field(
default=0.0, description="0.0–1.0 confidence in the diagnosis"
)
ranked_hypotheses: list[str] = Field(
default_factory=list,
description="Alternative hypotheses ranked by likelihood (most to least likely)",
)
missing_evidence: list[str] = Field(
default_factory=list,
description="Evidence that would confirm or refute the diagnosis but was unavailable",
)

return DiagnosisSchema

Expand All @@ -157,10 +198,12 @@ class _DiagnosisPayload(TypedDict):
root_cause: str
root_cause_category: str
causal_chain: list[str]
validated_claims: list[str]
validated_claims: list[dict]
non_validated_claims: list[str]
remediation_steps: list[str]
validity_score: float
ranked_hypotheses: list[str]
missing_evidence: list[str]

llm = get_llm_for_reasoning()
schema_model = _build_diagnosis_schema(_taxonomy_categories_for_alert_source(alert_source))
Expand All @@ -181,10 +224,23 @@ def _to_claim_dicts(claims: list[str], status: str) -> list[dict]:
root_cause=schema["root_cause"],
root_cause_category=schema["root_cause_category"],
causal_chain=schema["causal_chain"],
validated_claims=_to_claim_dicts(schema["validated_claims"], "validated"),
validated_claims=[
{
"claim": c["claim"],
"validation_status": "validated",
**(
{"evidence_sources": c["evidence_sources"]} if c.get("evidence_sources") else {}
),
}
for c in schema["validated_claims"]
if c.get("claim")
],
non_validated_claims=_to_claim_dicts(schema["non_validated_claims"], "not_validated"),
remediation_steps=schema["remediation_steps"],
validity_score=schema["validity_score"],
confidence_band=classify_confidence_band(schema["validity_score"]),
ranked_hypotheses=schema["ranked_hypotheses"],
missing_evidence=schema["missing_evidence"],
)


Expand All @@ -207,6 +263,7 @@ def _parse_via_legacy(
],
remediation_steps=rr.remediation_steps,
validity_score=0.5,
confidence_band=classify_confidence_band(0.5),
)
except Exception as err:
logger.warning("Legacy parse_root_cause also failed: %s", err)
Expand Down
1 change: 1 addition & 0 deletions app/cli/investigation/investigate.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ def run_investigation_cli(
"root_cause": state["root_cause"],
"is_noise": state.get("is_noise", False),
"validity_score": state.get("validity_score", 0.0),
"confidence_band": state.get("confidence_band", ""),
}
if state.get("evidence_entries"):
out["tool_calls"] = state["evidence_entries"]
Expand Down
71 changes: 71 additions & 0 deletions app/delivery/publish_findings/formatters/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,12 @@ def format_slack_message(ctx: ReportContext) -> str:
if top_log:
conclusion_block += f"`{top_log}`\n"

confidence_band = ctx.get("confidence_band", "")
validity_score_val = ctx.get("validity_score")
if confidence_band:
pct = f" ({validity_score_val:.0%})" if isinstance(validity_score_val, (int, float)) else ""
conclusion_block += f"*Confidence:* {confidence_band.upper()}{pct}\n"

validated_lines, non_validated_lines = _render_claim_lines(ctx)
if validated_lines:
# Use a larger markdown heading so that "Findings" stands out as a section.
Expand All @@ -475,6 +481,21 @@ def format_slack_message(ctx: ReportContext) -> str:
"\n*Non-Validated Claims (Inferred):*\n" + "\n".join(non_validated_lines) + "\n"
)

ranked_hypotheses = ctx.get("ranked_hypotheses") or []
if ranked_hypotheses:
conclusion_block += (
"\n*Alternative hypotheses:*\n"
+ "\n".join(f"• {_sanitize_for_slack(h)}" for h in ranked_hypotheses)
+ "\n"
)
missing_evidence_list = ctx.get("missing_evidence") or []
if missing_evidence_list:
conclusion_block += (
"\n*Missing evidence:*\n"
+ "\n".join(f"• {_sanitize_for_slack(e)}" for e in missing_evidence_list)
+ "\n"
)

correlation_signal_lines, correlation_driver_lines = _format_correlation_lines(ctx)
if correlation_signal_lines or correlation_driver_lines:
conclusion_block += "\n## Upstream Correlation\n"
Expand Down Expand Up @@ -556,12 +577,33 @@ def format_telegram_message(ctx: ReportContext) -> str:
rc += "\n<code>" + html.escape(top_log) + "</code>"
parts.append(rc)

confidence_band = ctx.get("confidence_band", "")
validity_score_val = ctx.get("validity_score")
if confidence_band:
pct = f" ({validity_score_val:.0%})" if isinstance(validity_score_val, (int, float)) else ""
parts.append(f"<b>Confidence:</b> {html.escape(confidence_band.upper())}{pct}")

validated_lines, non_validated_lines = _render_claim_lines_telegram(ctx)
if validated_lines:
parts.append("<b>Findings</b>\n" + "\n".join(validated_lines))
if non_validated_lines:
parts.append("<b>Non-Validated Claims (Inferred)</b>\n" + "\n".join(non_validated_lines))

ranked_hypotheses = ctx.get("ranked_hypotheses") or []
if ranked_hypotheses:
hyp = "\n".join(
"• " + _to_telegram_html_body(_sanitize_for_slack(str(h))) for h in ranked_hypotheses
)
parts.append("<b>Alternative hypotheses</b>\n" + hyp)

missing_evidence_list = ctx.get("missing_evidence") or []
if missing_evidence_list:
me = "\n".join(
"• " + _to_telegram_html_body(_sanitize_for_slack(str(e)))
for e in missing_evidence_list
)
parts.append("<b>Missing evidence</b>\n" + me)

provenance_lines = _format_provenance_lines(ctx)
if provenance_lines:
prov = "\n".join(
Expand Down Expand Up @@ -695,6 +737,13 @@ def _add(block: "dict[str, Any] | None") -> None:
rc_text += f"\n`{top_log}`"
_add(_mrkdwn_section(rc_text))

# ── Confidence band ──
confidence_band = ctx.get("confidence_band", "")
validity_score_val = ctx.get("validity_score")
if confidence_band:
pct = f" ({validity_score_val:.0%})" if isinstance(validity_score_val, (int, float)) else ""
_add(_mrkdwn_section(f"*Confidence:* {confidence_band.upper()}{pct}"))

# ── Failed Pods ──
datadog_site = ctx.get("datadog_site", "datadoghq.com")
all_pods = get_failed_pods(ctx)
Expand Down Expand Up @@ -727,6 +776,28 @@ def _add(block: "dict[str, Any] | None") -> None:
if non_validated_lines:
_add(_mrkdwn_section("*Inferred (not yet validated)*\n" + "\n".join(non_validated_lines)))

# ── Alternative Hypotheses ──
ranked_hypotheses = ctx.get("ranked_hypotheses") or []
if ranked_hypotheses:
blocks.append({"type": "divider"})
_add(
_mrkdwn_section(
"*Alternative hypotheses:*\n"
+ "\n".join(f"• {_sanitize_for_slack(h)}" for h in ranked_hypotheses)
)
)

# ── Missing Evidence ──
missing_evidence_list = ctx.get("missing_evidence") or []
if missing_evidence_list:
blocks.append({"type": "divider"})
_add(
_mrkdwn_section(
"*Missing evidence:*\n"
+ "\n".join(f"• {_sanitize_for_slack(e)}" for e in missing_evidence_list)
)
)
Comment thread
greptile-apps[bot] marked this conversation as resolved.

correlation_signal_lines, correlation_driver_lines = _format_correlation_lines(ctx)
if correlation_signal_lines or correlation_driver_lines:
blocks.append({"type": "divider"})
Expand Down
9 changes: 8 additions & 1 deletion app/delivery/publish_findings/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,14 @@ def generate_report(state: InvestigationState) -> dict:

all_blocks = build_slack_blocks(ctx) + build_action_blocks(investigation_url, investigation_id)
all_blocks = masking_ctx.unmask_value(all_blocks)
render_report(slack_message, root_cause_category=state.get("root_cause_category"))
render_report(
slack_message,
root_cause_category=state.get("root_cause_category"),
confidence_band=state.get("confidence_band", ""),
validity_score=state.get("validity_score"),
ranked_hypotheses=state.get("ranked_hypotheses", []),
missing_evidence=state.get("missing_evidence", []),
)
open_in_editor(slack_message)

slack_ctx = state.get("slack_context", {})
Expand Down
Loading