Add llm evaluations (#204)

This PR makes three main changes:
- Improves the prompt to prevent the LLM from mentioning `kubectl logs` and `kubectl logs --previous` separately
- Introduces a test suite for investigations, including the ability to mock the DB access
- Adds an integration with braintrust.dev, although that code is currently commented out because I hit the freemium limits and need to talk to them first

It's a big PR, and most of it is the refactor of the existing mock mechanism so that it works well with the DAL.
robusta-dev · Nov 25, 2024 · 57951e1 · 57951e1
1 parent 3383b2f
commit 57951e1
Show file tree

Hide file tree

Showing 153 changed files with 3,622 additions and 1,901 deletions.
diff --git a/.gitignore b/.gitignore
@@ -161,3 +161,5 @@ cython_debug/
 playwright.png
 .deepeval*
 pyrightconfig.json
+
+*.AUTOGENERATED
diff --git a/holmes/config.py b/holmes/config.py
@@ -10,6 +10,7 @@
 from pydash.arrays import concat
 from rich.console import Console
 
+
 from holmes.core.runbooks import RunbookManager
 from holmes.core.supabase_dal import SupabaseDal
 from holmes.core.tool_calling_llm import (IssueInvestigator, ToolCallingLLM,
@@ -102,7 +103,7 @@ def load_from_env(cls):
                 kwargs[field_name] = val
         return cls(**kwargs)
 
-    def _create_tool_executor(
+    def create_tool_executor(
         self, console: Console, allowed_toolsets: ToolsetPattern, dal:Optional[SupabaseDal]
     ) -> ToolExecutor:
         all_toolsets = load_builtin_toolsets(dal=dal)
@@ -146,7 +147,7 @@ def _create_tool_executor(
     def create_toolcalling_llm(
         self, console: Console, allowed_toolsets: ToolsetPattern, dal:Optional[SupabaseDal] = None
     ) -> ToolCallingLLM:
-        tool_executor = self._create_tool_executor(console, allowed_toolsets, dal)
+        tool_executor = self.create_tool_executor(console, allowed_toolsets, dal)
         return ToolCallingLLM(
             tool_executor,
             self.max_steps,
@@ -164,7 +165,7 @@ def create_issue_investigator(
             all_runbooks.extend(load_runbooks_from_file(runbook_path))
 
         runbook_manager = RunbookManager(all_runbooks)
-        tool_executor = self._create_tool_executor(console, allowed_toolsets, dal)
+        tool_executor = self.create_tool_executor(console, allowed_toolsets, dal)
         return IssueInvestigator(
             tool_executor,
             runbook_manager,

diff --git a/holmes/core/investigation.py b/holmes/core/investigation.py
@@ -0,0 +1,47 @@
+
+from rich.console import Console
+from holmes.common.env_vars import ALLOWED_TOOLSETS, HOLMES_POST_PROCESSING_PROMPT
+from holmes.config import Config
+from holmes.core.issue import Issue
+from holmes.core.models import InvestigateRequest, InvestigationResult
+from holmes.core.supabase_dal import SupabaseDal
+from holmes.utils.robusta import load_robusta_api_key
+
+
+def investigate_issues(investigate_request: InvestigateRequest, dal: SupabaseDal, config: Config, console:Console):
+    load_robusta_api_key(dal=dal, config=config)
+    context = dal.get_issue_data(
+        investigate_request.context.get("robusta_issue_id")
+    )
+
+    resource_instructions = dal.get_resource_instructions(
+        "alert", investigate_request.context.get("issue_type")
+    )
+    raw_data = investigate_request.model_dump()
+    if context:
+        raw_data["extra_context"] = context
+
+    ai = config.create_issue_investigator(
+        console, allowed_toolsets=ALLOWED_TOOLSETS, dal=dal
+    )
+    issue = Issue(
+        id=context["id"] if context else "",
+        name=investigate_request.title,
+        source_type=investigate_request.source,
+        source_instance_id=investigate_request.source_instance_id,
+        raw=raw_data,
+    )
+
+    investigation = ai.investigate(
+        issue,
+        prompt=investigate_request.prompt_template,
+        console=console,
+        post_processing_prompt=HOLMES_POST_PROCESSING_PROMPT,
+        instructions=resource_instructions,
+    )
+
+    return InvestigationResult(
+        analysis=investigation.result,
+        tool_calls=investigation.tool_calls or [],
+        instructions=investigation.instructions,
+    )
diff --git a/holmes/core/supabase_dal.py b/holmes/core/supabase_dal.py
@@ -3,7 +3,7 @@
 import logging
 import os
 import threading
-from typing import Dict, Optional, List
+from typing import Dict, Optional, List, Tuple
 from uuid import uuid4
 
 import yaml
@@ -31,7 +31,6 @@
 class RobustaConfig(BaseModel):
     sinks_config: List[Dict[str, Dict]]
 
-
 class RobustaToken(BaseModel):
     store_url: str
     api_key: str
@@ -127,10 +126,11 @@ def sign_in(self) -> str:
         self.client.postgrest.auth(res.session.access_token)
         return res.user.id
 
-    def get_issue_data(self, issue_id: str) -> Optional[Dict]:
+    def get_issue_data(self, issue_id: Optional[str]) -> Optional[Dict]:
         # TODO this could be done in a single atomic SELECT, but there is no
         # foreign key relation between Issues and Evidence.
-
+        if not issue_id:
+            return None
         if not self.enabled:  # store not initialized
             return None
         issue_data = None
@@ -145,7 +145,7 @@ def get_issue_data(self, issue_id: str) -> Optional[Dict]:
             if len(issue_response.data):
                 issue_data = issue_response.data[0]
 
-        except:  # e.g. invalid id format
+        except Exception:  # e.g. invalid id format
             logging.exception("Supabase error while retrieving issue data")
             return None
         if not issue_data:
@@ -205,7 +205,7 @@ def create_session_token(self) -> str:
         ).execute()
         return token
 
-    def get_ai_credentials(self) -> (str, str):
+    def get_ai_credentials(self) -> Tuple[str, str]:
         with self.lock:
             session_token = self.token_cache.get("session_token")
             if not session_token:

diff --git a/holmes/core/tool_calling_llm.py b/holmes/core/tool_calling_llm.py
@@ -2,7 +2,6 @@
 import json
 import logging
 import textwrap
-import os
 from typing import List, Optional, Dict
 from holmes.utils.tags import format_tags_in_string, parse_messages_tags
 from holmes.plugins.prompts import load_and_render_prompt
@@ -333,6 +332,7 @@ def investigate(
         post_processing_prompt: Optional[str] = None,
     ) -> LLMResult:
         runbooks = self.runbook_manager.get_instructions_for_issue(issue)
+
         if instructions != None and instructions.instructions:
             runbooks.extend(instructions.instructions)
 

diff --git a/holmes/plugins/prompts/_general_instructions.jinja2 b/holmes/plugins/prompts/_general_instructions.jinja2
@@ -18,13 +18,14 @@ If investigating Kubernetes problems:
 * run as many kubectl commands as you need to gather more information, then respond.
 * if possible, do so repeatedly on different Kubernetes objects.
 * for example, for deployments first run kubectl on the deployment then a replicaset inside it, then a pod inside that.
-* when investigating a pod that crashed or application errors, always run kubectl_describe and fetch logs with both kubectl_previous_logs and kubectl_logs so that you see current logs and any logs from before a crash.
+* use both kubectl_previous_logs and kubectl_logs when reading logs. Treat the output of both as a single unified logs stream
+* when investigating a pod that crashed or application errors, always run kubectl_describe and fetch the logs
 * do not give an answer like "The pod is pending" as that doesn't state why the pod is pending and how to fix it.
 * do not give an answer like "Pod's node affinity/selector doesn't match any available nodes" because that doesn't include data on WHICH label doesn't match
 * if investigating an issue on many pods, there is no need to check more than 3 individual pods in the same deployment. pick up to a representative 3 from each deployment if relevant
 * if the user says something isn't working, ALWAYS:
 ** use kubectl_describe on the owner workload + individual pods and look for any transient issues they might have been referring to
-** check the application aspects with kubectl_logs + kubectl_previous_logs and other relevant tools
+** check the application aspects through the logs (kubectl_logs and kubectl_previous_logs) and other relevant tools
 ** look for misconfigured ingresses/services etc
 
 Special cases and how to reply: