From c3de44f779741296416997a8655714cd515a06d5 Mon Sep 17 00:00:00 2001 From: Nicolas Herment Date: Mon, 20 Jan 2025 16:02:22 +0100 Subject: [PATCH] feat: cleanup structured output code --- holmes/core/investigation.py | 4 +- holmes/core/investigation_output_format.py | 43 ++++++ holmes/core/models.py | 1 + holmes/core/structured_output.py | 2 - holmes/core/tool_calling_llm.py | 134 +----------------- .../prompts/generic_investigation copy.jinja2 | 41 ------ .../prompts/generic_investigation.jinja2 | 2 +- tests/llm/test_investigate.py | 9 +- 8 files changed, 60 insertions(+), 176 deletions(-) create mode 100644 holmes/core/investigation_output_format.py delete mode 100644 holmes/plugins/prompts/generic_investigation copy.jinja2 diff --git a/holmes/core/investigation.py b/holmes/core/investigation.py index efb3b0fb..5d2d7366 100644 --- a/holmes/core/investigation.py +++ b/holmes/core/investigation.py @@ -8,7 +8,7 @@ from holmes.utils.robusta import load_robusta_api_key -def investigate_issues(investigate_request: InvestigateRequest, dal: SupabaseDal, config: Config, console:Console): +def investigate_issues(investigate_request: InvestigateRequest, dal: SupabaseDal, config: Config, console:Console) -> InvestigationResult: load_robusta_api_key(dal=dal, config=config) context = dal.get_issue_data( investigate_request.context.get("robusta_issue_id") @@ -42,9 +42,9 @@ def investigate_issues(investigate_request: InvestigateRequest, dal: SupabaseDal instructions=resource_instructions, global_instructions=global_instructions ) - return InvestigationResult( analysis=investigation.result, + sections=investigation.sections, tool_calls=investigation.tool_calls or [], instructions=investigation.instructions, ) diff --git a/holmes/core/investigation_output_format.py b/holmes/core/investigation_output_format.py new file mode 100644 index 00000000..6454a121 --- /dev/null +++ b/holmes/core/investigation_output_format.py @@ -0,0 +1,43 @@ +from typing import Any + +schema = { + 
"$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "required": [ + "Alert Explanation", + "Investigation", + "Conclusions and Possible Root causes", + "Next Steps" + ], + "properties": { + "Alert Explanation": { + "type": ["string", "null"], + "description": "1-2 sentences explaining the alert itself - note don't say \"The alert indicates a warning event related to a Kubernetes pod doing blah\" rather just say \"The pod XYZ did blah\" because that is what the user actually cares about" + }, + "Investigation": { + "type": ["string", "null"], + "description": "what you checked and found" + }, + "Conclusions and Possible Root causes": { + "type": ["string", "null"], + "description": "what conclusions can you reach based on the data you found? what are possible root causes (if you have enough conviction to say) or what uncertainty remains" + }, + "Next Steps": { + "type": ["string", "null"], + "description": "what you would do next to troubleshoot this issue, any commands that could be run to fix it, or other ways to solve it (prefer giving precise bash commands when possible)" + } + }, + "additionalProperties": False +} + +ExpectedInvestigationOutputFormat = { "type": "json_schema", "json_schema": { "name": "InvestigationResult", "schema": schema, "strict": True} } + +def combine_sections(sections: Any) -> str: + if isinstance(sections, dict): + content = '' + for section_title, section_content in sections.items(): + if section_content: + # content = content + f'\n# {" ".join(section_title.split("_")).title()}\n{section_content}' + content = content + f'\n# {section_title}\n{section_content}\n' + return content + return f"{sections}" diff --git a/holmes/core/models.py b/holmes/core/models.py index 553e5812..ca3e5eb2 100644 --- a/holmes/core/models.py +++ b/holmes/core/models.py @@ -6,6 +6,7 @@ class InvestigationResult(BaseModel): analysis: Optional[str] = None + sections: Optional[Dict[str, str]] = None tool_calls: List[ToolCallResult] = [] 
instructions: List[str] = [] diff --git a/holmes/core/structured_output.py b/holmes/core/structured_output.py index e0145eb4..23eaa990 100644 --- a/holmes/core/structured_output.py +++ b/holmes/core/structured_output.py @@ -23,8 +23,6 @@ class StructuredResponse(LLMResult): def generate_structured_output(llm_result:LLMResult) -> StructuredResponse: - print("") - return StructuredResponse( **llm_result.model_dump(), sections=[], diff --git a/holmes/core/tool_calling_llm.py b/holmes/core/tool_calling_llm.py index a73ffc98..7c59da70 100644 --- a/holmes/core/tool_calling_llm.py +++ b/holmes/core/tool_calling_llm.py @@ -8,6 +8,7 @@ from typing import List, Optional from holmes.core.llm import LLM from holmes.plugins.prompts import load_and_render_prompt +from holmes.core.investigation_output_format import ExpectedInvestigationOutputFormat, combine_sections from openai import BadRequestError from openai._types import NOT_GIVEN from openai.types.chat.chat_completion_message_tool_call import ( @@ -61,44 +62,6 @@ class ResourceInstructions(BaseModel): instructions: List[str] = [] documents: List[ResourceInstructionDocument] = [] -class ExpectedOutputFormat(BaseModel): - alert_explanation: Union[str, None] - investigation: Union[str, None] - conclusions_and_possible_root_causes: Union[str, None] - next_steps: Union[str, None] - -schema = { - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "required": [ - "Alert Explanation", - "Investigation", - "Conclusions and Possible Root causes", - "Next Steps" - ], - "properties": { - "Alert Explanation": { - "type": ["string", "null"], - "description": "1-2 sentences explaining the alert itself - note don't say \"The alert indicates a warning event related to a Kubernetes pod doing blah\" rather just say \"The pod XYZ did blah\" because that is what the user actually cares about" - }, - "Investigation": { - "type": ["string", "null"], - "description": "what you checked and found" - }, - "Conclusions and 
Possible Root causes": { - "type": ["string", "null"], - "description": "what conclusions can you reach based on the data you found? what are possible root causes (if you have enough conviction to say) or what uncertainty remains" - }, - "Next Steps": { - "type": ["string", "null"], - "description": "what you would do next to troubleshoot this issue, any commands that could be run to fix it, or other ways to solve it (prefer giving precise bash commands when possible)" - } - }, - "additionalProperties": False -} - -response_format = { "type": "json_schema", "json_schema": schema , "strict": False } - class ToolCallingLLM: llm: LLM @@ -156,7 +119,7 @@ def call( logging.warning("Token limit exceeded. Truncating tool responses.") messages = self.truncate_messages_to_fit_context( messages, max_context_size, maximum_output_token - ) + ) logging.debug(f"sending messages={messages}\n\ntools={tools}") try: @@ -199,7 +162,7 @@ def call( pass if not isinstance(text_response, str): sections = text_response - text_response = stringify_sections(sections) + text_response = combine_sections(sections) if not tools_to_call: # For chatty models post process and summarize the result @@ -275,7 +238,7 @@ def _invoke_tool( tool_response = tool.invoke(tool_params) - return ToolCallResult( + return ToolCallResult( tool_call_id=tool_call_id, tool_name=tool_name, description=tool.get_parameterized_one_liner(tool_params), @@ -426,93 +389,6 @@ def investigate( ) logging.debug("Rendered user prompt:\n%s", textwrap.indent(user_prompt, " ")) - res = self.prompt_call(system_prompt, user_prompt, post_processing_prompt, response_format=ExpectedOutputFormat) - print(res) - print("******") + res = self.prompt_call(system_prompt, user_prompt, post_processing_prompt, response_format=ExpectedInvestigationOutputFormat) res.instructions = runbooks - generate_structured_output(res, llm=self.llm) return res - -## ## ## ## ## ## START CUSTOM STRUCTURED RESPONSE - -class 
StructuredSection(BaseModel): - title: str - content: Union[str, None] - contains_meaningful_information: bool - -class StructuredResponse(BaseModel): - sections: List[StructuredSection] - -# class StructuredLLMResult(LLMResult): -# sections: List[StructuredSection] - - -EXPECTED_SECTIONS = [ - "investigation steps", - "conclusions and possible root causes", - "related logs", - "alert explanation", - "next steps" -] - -PROMPT = f""" -Your job as a LLM is to take the unstructured output from another LLM -and structure it into sections. Keep the original wording and do not -add any information that is not already there. - -Return a JSON with the section title, its content and . Each section content should -be markdown formatted text. If you consider a section as empty, set -its corresponding value to null. - -For example: - -[{{ - "title": "investigation steps", - "text": "The pod `kafka-consumer` is in a `Failed` state with the container terminated for an unknown reason and an exit code of 255.", - "contains_meaningful_information": true -}}, {{ - "title": "conclusions and possible root causes", - "text": "...", - "contains_meaningful_information": true -}}, {{ - "title": "next steps", - "text": null, - "contains_meaningful_information": false -}}, ...] 
- -The section titles are [{", ".join(EXPECTED_SECTIONS)}] -""" - -def stringify_sections(sections: Any) -> str: - if isinstance(sections, dict): - content = '' - for section_title, section_content in sections.items(): - content = content + f'\n# {" ".join(section_title.split("_")).title()}\n{section_content}' - return content - return f"{sections}" - -def generate_structured_output(llm_result:LLMResult, llm:LLM) -> LLMResult: - if not llm_result.result: - return LLMResult( - **llm_result.model_dump() - ) - - messages = [ - {"role": "system", "content": PROMPT}, - {"role": "user", "content": llm_result.result}, - ] - - r = llm.completion( - model_override="gpt-4o-mini", - messages=messages, - temperature=0.00000001, - response_format=StructuredResponse, - drop_params=True, - ) - # r_json = r.to_json() - # result = StructuredLLMResult.model_validate_json(r.choices[0].message.content) - print(r) - llm_result.sections = {} - return llm_result - -## ## ## ## ## ## END CUSTOM STRUCTURED RESPONSE diff --git a/holmes/plugins/prompts/generic_investigation copy.jinja2 b/holmes/plugins/prompts/generic_investigation copy.jinja2 deleted file mode 100644 index e078c9d6..00000000 --- a/holmes/plugins/prompts/generic_investigation copy.jinja2 +++ /dev/null @@ -1,41 +0,0 @@ -You are a tool-calling AI assist provided with common devops and IT tools that you can use to troubleshoot problems or answer questions. -Whenever possible you MUST first use tools to investigate then answer the question. -Do not say 'based on the tool output' - -Provide an terse analysis of the following {{ issue.source_type }} alert/issue and why it is firing. - -If the user provides you with extra instructions in a triple quotes section, ALWAYS perform their instructions and then perform your investigation. - - -Global Instructions -You may receive a set of “Global Instructions” that describe how to perform certain tasks, handle certain situations, or apply certain best practices. 
They are not mandatory for every request, but serve as a reference resource and must be used if the current scenario or user request aligns with one of the described methods or conditions. -Use these rules when deciding how to apply them: - -* If the user prompt includes Global Instructions, treat them as a reference resource. -* Some Global Instructions may describe how to handle specific tasks or scenarios. If the user's current request or the instructions in a triple quotes section reference one of these tasks, follow the Global Instruction for that task. -* Some Global Instructions may define general conditions that always apply if a certain scenario occurs (e.g., "whenever investigating a memory issue, always check resource limits"). If such a condition matches the current situation, apply the Global Instruction accordingly. -* If user's prompt or the instructions in a triple quotes section direct you to perform a task (e.g., “Find owner”) and there is a Global Instruction on how to do that task, follow the Global Instructions on how to perform it. -* If multiple Global Instructions are relevant, apply all that fit. -* If no Global Instruction is relevant, or no condition applies, ignore them and proceed as normal. -* Before finalizing your answer double-check if any Global Instructions apply. If so, ensure you have correctly followed those instructions. - -{% include '_general_instructions.jinja2' %} - -Style Guide: -* `code block` exact names of IT/cloud resources like specific virtual machines. -* *Surround the title of the root cause like this*. -* Whenever there are precise numbers in the data available, quote them. 
For example: -* Don't say an app is repeatedly crashing, rather say the app has crashed X times so far -* Don't just say x/y nodes don't match a pod's affinity selector, rather say x/y nodes don't match the selector ABC -* Don't say "The alert indicates a warning event related to a Kubernetes pod failing to start due to a container creation error" rather say "The pod failed to start due to a container creation error." -* And so on -* But only quote relevant numbers or metrics that are available. Do not guess. -* Remove unnecessary words - -Give your answer in a JSON format with the following sections. The content of each section should be formatted with markdown: - -- Alert Explanation: <1-2 sentences explaining the alert itself - note don't say "The alert indicates a warning event related to a Kubernetes pod doing blah" rather just say "The pod XYZ did blah" because that is what the user actually cares about> -- Investigation: -- Conclusions and Possible Root causes: -- Next Steps: -- diff --git a/holmes/plugins/prompts/generic_investigation.jinja2 b/holmes/plugins/prompts/generic_investigation.jinja2 index e078c9d6..762189c2 100644 --- a/holmes/plugins/prompts/generic_investigation.jinja2 +++ b/holmes/plugins/prompts/generic_investigation.jinja2 @@ -32,7 +32,7 @@ Style Guide: * But only quote relevant numbers or metrics that are available. Do not guess. * Remove unnecessary words -Give your answer in a JSON format with the following sections. The content of each section should be formatted with markdown: +Give your answer in a JSON format with the following sections. You can skip a section if it's not relevant to the investigation. 
The content of each section should be formatted with markdown: - Alert Explanation: <1-2 sentences explaining the alert itself - note don't say "The alert indicates a warning event related to a Kubernetes pod doing blah" rather just say "The pod XYZ did blah" because that is what the user actually cares about> - Investigation: diff --git a/tests/llm/test_investigate.py b/tests/llm/test_investigate.py index 7f0a3703..d8d112cb 100644 --- a/tests/llm/test_investigate.py +++ b/tests/llm/test_investigate.py @@ -33,7 +33,7 @@ def __init__(self, test_case:InvestigateTestCase): self._test_case = test_case def create_tool_executor( - self, console: Console, allowed_toolsets: ToolsetPattern, dal:Optional[SupabaseDal] + self, console: Console, dal:Optional[SupabaseDal] ) -> ToolExecutor: mock = MockToolsets(generate_mocks=self._test_case.generate_mocks, test_case_folder=self._test_case.folder) @@ -127,6 +127,13 @@ def test_investigate(experiment_name, test_case): print(f"** OUTPUT **\n{output}") print(f"** SCORES **\n{scores}") + assert result.sections + assert len(result.sections) >= 4 + assert result.sections.get("Alert Explanation") + assert result.sections.get("Investigation") + assert result.sections.get("Conclusions and Possible Root causes") + assert result.sections.get("Next Steps") + if scores.get("faithfulness"): assert scores.get("faithfulness") >= test_case.evaluation.faithfulness