From 310f5c25b904faba2cd9497764224537ec69f45d Mon Sep 17 00:00:00 2001
From: Nicolas Herment
Date: Mon, 13 Jan 2025 10:28:33 +0100
Subject: [PATCH] perf: add timing instrumentation to investigation and chat
 paths

Add a lightweight PerfTiming helper (holmes/core/perf_timing.py) and use it
to time toolset loading, LLM completions, tool invocations and the
investigate/chat endpoints. Log per-request timings via a FastAPI middleware
and tracemalloc memory diffs in the server, and drop the unused rich Console
parameter that was threaded through config and the LLM call sites.
---
 examples/custom_llm.py                  |   1 -
 holmes/config.py                        |  44 ++++++----
 holmes/core/investigation.py            |  15 +++-
 holmes/core/llm.py                      |  63 +++++++++++++-
 holmes/core/perf_timing.py              |  47 +++++++++++
 holmes/core/supabase_dal.py             |  17 ++--
 holmes/core/tool_calling_llm.py         |  48 +++++++----
 holmes/plugins/toolsets/__init__.py     |   9 +-
 holmes/plugins/toolsets/findings.py     |   4 +-
 holmes/plugins/toolsets/grafana_loki.py |   1 -
 holmes/plugins/toolsets/internet.py     |   4 +-
 holmes/utils/global_instructions.py     |   5 +-
 holmes/utils/holmes_sync_toolsets.py    |  19 ++---
 poetry.lock                             |  32 +++++++-
 pyproject.toml                          |   1 +
 server.py                               | 104 ++++++++++++++++++++----
 test-api_chat.sh                        |   2 +
 tests/llm/test_investigate.py           |   2 +-
 tests/test_issue_investigator.py        |  10 +--
 19 files changed, 340 insertions(+), 88 deletions(-)
 create mode 100644 holmes/core/perf_timing.py
 create mode 100755 test-api_chat.sh

diff --git a/examples/custom_llm.py b/examples/custom_llm.py
index f3ec0c06..194b9764 100644
--- a/examples/custom_llm.py
+++ b/examples/custom_llm.py
@@ -39,7 +39,6 @@ def completion(self, messages: List[Dict[str, Any]], tools: Optional[List[Tool]]
 
 
 def ask_holmes():
-    console = Console()
 
     prompt = "what issues do I have in my cluster"

diff --git a/holmes/config.py b/holmes/config.py
index 79fb94c3..4023ddf1 100644
--- a/holmes/config.py
+++ b/holmes/config.py
@@ -8,7 +8,6 @@
 from pydantic import FilePath, SecretStr, Field
 from pydash.arrays import concat
-from rich.console import Console
 
 from holmes.core.runbooks import RunbookManager
 
@@ -35,6 +34,7 @@ from holmes.core.tools import YAMLToolset
 from holmes.common.env_vars import ROBUSTA_CONFIG_PATH
 from holmes.utils.definitions import RobustaConfig
+from holmes.core.perf_timing import PerfTiming
 
 DEFAULT_CONFIG_LOCATION = os.path.expanduser("~/.holmes/config.yaml")
 
@@ -133,20 +133,18 @@ def __get_cluster_name() -> Optional[str]:
         return None
 
     def create_console_tool_executor(
-        self, console: Console, allowed_toolsets: ToolsetPattern, dal:Optional[SupabaseDal]
+        self, allowed_toolsets: ToolsetPattern, dal:Optional[SupabaseDal]
     ) -> ToolExecutor:
         """
         Creates ToolExecutor for the cli
         """
         default_toolsets = [toolset for toolset in load_builtin_toolsets(dal, grafana_config=self.grafana) if any(tag in (ToolsetTag.CORE, ToolsetTag.CLI) for tag in toolset.tags)]
-
         if allowed_toolsets == "*":
             matching_toolsets = default_toolsets
         else:
             matching_toolsets = get_matching_toolsets(
                 default_toolsets, allowed_toolsets.split(",")
             )
-
         # Enable all matching toolsets that have CORE or CLI tag
         for toolset in matching_toolsets:
             toolset.enabled = True
@@ -184,17 +182,20 @@ def create_console_tool_executor(
         return ToolExecutor(enabled_toolsets)
 
     def create_tool_executor(
-        self, console: Console, dal:Optional[SupabaseDal]
+        self, dal:Optional[SupabaseDal]
     ) -> ToolExecutor:
         """
         Creates ToolExecutor for the server endpoints
         """
+        t = PerfTiming("create_tool_executor")
 
         all_toolsets = load_builtin_toolsets(dal=dal, grafana_config=self.grafana)
+        t.measure("load_builtin_toolsets")
 
         if os.path.isfile(CUSTOM_TOOLSET_LOCATION):
             try:
                 all_toolsets.extend(load_toolsets_from_file(CUSTOM_TOOLSET_LOCATION, silent_fail=True))
+                t.measure(f"load_toolsets_from_file {CUSTOM_TOOLSET_LOCATION}")
             except Exception as error:
                 logging.error(f"An error happened while trying to use custom toolset: {error}")
 
@@ -203,12 +204,16 @@ def create_tool_executor(
         logging.debug(
             f"Starting AI session with 
tools: {[t.name for t in enabled_tools]}" ) - return ToolExecutor(enabled_toolsets) + t.measure("merge toolsets") + tool_executor = ToolExecutor(enabled_toolsets) + t.measure("instantiate ToolExecutor") + t.end() + return tool_executor def create_console_toolcalling_llm( - self, console: Console, allowed_toolsets: ToolsetPattern, dal:Optional[SupabaseDal] = None + self, allowed_toolsets: ToolsetPattern, dal:Optional[SupabaseDal] = None ) -> ToolCallingLLM: - tool_executor = self.create_console_tool_executor(console, allowed_toolsets, dal) + tool_executor = self.create_console_tool_executor(allowed_toolsets, dal) return ToolCallingLLM( tool_executor, self.max_steps, @@ -216,9 +221,12 @@ def create_console_toolcalling_llm( ) def create_toolcalling_llm( - self, console: Console, dal:Optional[SupabaseDal] = None + self, dal:Optional[SupabaseDal] = None ) -> ToolCallingLLM: - tool_executor = self.create_tool_executor(console, dal) + t = PerfTiming("create_toolcalling_llm") + tool_executor = self.create_tool_executor(dal) + t.measure("create_tool_executor") + t.end() return ToolCallingLLM( tool_executor, self.max_steps, @@ -227,25 +235,31 @@ def create_toolcalling_llm( def create_issue_investigator( self, - console: Console, dal: Optional[SupabaseDal] = None ) -> IssueInvestigator: + t = PerfTiming("create_issue_investigator") all_runbooks = load_builtin_runbooks() + t.measure("load_builtin_runbooks") for runbook_path in self.custom_runbooks: all_runbooks.extend(load_runbooks_from_file(runbook_path)) + t.measure("custom_runbooks -> load_runbooks_from_file") runbook_manager = RunbookManager(all_runbooks) - tool_executor = self.create_tool_executor(console, dal) - return IssueInvestigator( + t.measure("RunbookManager()") + tool_executor = self.create_tool_executor(dal) + t.measure("create_tool_executor") + issue_investigator = IssueInvestigator( tool_executor, runbook_manager, self.max_steps, self._get_llm() ) + t.measure("IssueInvestigator()") + t.end() + return issue_investigator def create_console_issue_investigator( self, - console: Console, allowed_toolsets: ToolsetPattern, dal: Optional[SupabaseDal] = None ) -> IssueInvestigator: @@ -254,7 +268,7 @@ def create_console_issue_investigator( all_runbooks.extend(load_runbooks_from_file(runbook_path)) runbook_manager = RunbookManager(all_runbooks) - tool_executor = self.create_console_tool_executor(console, allowed_toolsets, dal) + tool_executor = self.create_console_tool_executor(allowed_toolsets, dal) return IssueInvestigator( tool_executor, runbook_manager, diff --git a/holmes/core/investigation.py b/holmes/core/investigation.py index efb3b0fb..b462796e 100644 --- a/holmes/core/investigation.py +++ b/holmes/core/investigation.py @@ -1,4 +1,5 @@ +from typing import Optional from rich.console import Console from holmes.common.env_vars import HOLMES_POST_PROCESSING_PROMPT from holmes.config import Config @@ -6,26 +7,33 @@ from holmes.core.models import InvestigateRequest, InvestigationResult from holmes.core.supabase_dal import SupabaseDal from holmes.utils.robusta import load_robusta_api_key +from holmes.core.perf_timing import PerfTiming -def investigate_issues(investigate_request: InvestigateRequest, dal: SupabaseDal, config: Config, console:Console): +def investigate_issues(investigate_request: InvestigateRequest, dal: SupabaseDal, config: Config, console:Optional[Console] = None): + t = PerfTiming("investigate_issues") load_robusta_api_key(dal=dal, config=config) context = dal.get_issue_data( 
investigate_request.context.get("robusta_issue_id")
     )
+    t.measure("get_issue_data")
     resource_instructions = dal.get_resource_instructions(
         "alert", investigate_request.context.get("issue_type")
     )
+    t.measure("dal.get_resource_instructions")
     global_instructions = dal.get_global_instructions_for_account()
+    t.measure("dal.get_global_instructions_for_account")
 
     raw_data = investigate_request.model_dump()
+    t.measure("investigate_request.model_dump")
     if context:
         raw_data["extra_context"] = context
 
     ai = config.create_issue_investigator(
-        console, dal=dal
+        dal=dal
     )
+    t.measure("config.create_issue_investigator")
     issue = Issue(
         id=context["id"] if context else "",
         name=investigate_request.title,
@@ -42,7 +50,8 @@ def investigate_issues(investigate_request: InvestigateRequest, dal: SupabaseDal
         instructions=resource_instructions,
         global_instructions=global_instructions
     )
-
+    t.measure("ai.investigate")
+    t.end()
     return InvestigationResult(
         analysis=investigation.result,
         tool_calls=investigation.tool_calls or [],
diff --git a/holmes/core/llm.py b/holmes/core/llm.py
index aed6e007..565ba8d7 100644
--- a/holmes/core/llm.py
+++ b/holmes/core/llm.py
@@ -10,8 +10,14 @@
 from pydantic import BaseModel
 import litellm
 import os
+import sys
+import json
+from types import ModuleType, FunctionType
+from gc import get_referents
 
 from holmes.common.env_vars import ROBUSTA_AI, ROBUSTA_API_ENDPOINT
+from holmes.core.perf_timing import PerfTiming, log_function_timing
+
 
 def environ_get_safe_int(env_var, default="0"):
     try:
@@ -22,6 +28,10 @@ def environ_get_safe_int(env_var, default="0"):
 OVERRIDE_MAX_OUTPUT_TOKEN = environ_get_safe_int("OVERRIDE_MAX_OUTPUT_TOKEN")
 OVERRIDE_MAX_CONTENT_SIZE = environ_get_safe_int("OVERRIDE_MAX_CONTENT_SIZE")
 
+cache = dict()
+cache_hit = 0
+cache_miss = 0
+
 
 class LLM:
 
     @abstractmethod
     def count_tokens_for_message(self, messages: list[dict]) -> int:
 
     def completion(self, messages: List[Dict[str, Any]], tools: Optional[List[Tool]] = [], tool_choice: Optional[Union[str, dict]] = None, response_format: Optional[Union[dict, Type[BaseModel]]] = None, temperature:Optional[float] = None, drop_params: Optional[bool] = None) -> ModelResponse:
         pass
 
+def hash_messages(messages:Any) -> int:
+    return hash(json.dumps(messages, sort_keys=True))
+
+
+# Custom objects know their class.
+# Function objects seem to know way too much, including modules.
+# Exclude modules as well. 
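+# Note for readers: getsize() below walks gc.get_referents() recursively, so
+# it reports the deep in-memory footprint of the response cache, unlike the
+# shallow sys.getsizeof(); BLACKLIST keeps the walk out of modules and classes.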
+BLACKLIST = type, ModuleType, FunctionType + + +def getsize(obj): + """sum size of object & members.""" + if isinstance(obj, BLACKLIST): + raise TypeError('getsize() does not take argument of type: '+ str(type(obj))) + seen_ids = set() + size = 0 + objects = [obj] + while objects: + need_referents = [] + for obj in objects: + if not isinstance(obj, BLACKLIST) and id(obj) not in seen_ids: + seen_ids.add(id(obj)) + size += sys.getsizeof(obj) + need_referents.append(obj) + objects = get_referents(*need_referents) + return size class DefaultLLM(LLM): @@ -100,12 +139,12 @@ def check_llm(self, model:str, api_key:Optional[str]): "https://docs.litellm.ai/docs/providers/watsonx#usage---models-in-deployment-spaces" ) else: - # + # api_key_env_var = f"{provider.upper()}_API_KEY" if api_key: os.environ[api_key_env_var] = api_key model_requirements = litellm.validate_environment(model=model) - + if not model_requirements["keys_in_environment"]: raise Exception(f"model {model} requires the following environment variables: {model_requirements['missing_keys']}") @@ -146,7 +185,24 @@ def count_tokens_for_message(self, messages: list[dict]) -> int: return litellm.token_counter(model=self.model, messages=messages) + @log_function_timing def completion(self, messages: List[Dict[str, Any]], tools: Optional[List[Tool]] = [], tool_choice: Optional[Union[str, dict]] = None, response_format: Optional[Union[dict, Type[BaseModel]]] = None, temperature:Optional[float] = None, drop_params: Optional[bool] = None) -> ModelResponse: + # hash_val = hash_messages(messages) + # global cache + # global cache_hit + # global cache_miss + # cache_value = None + # if hash_val in cache: + # cache_hit = cache_hit + 1 + # cache_value = cache.get(hash_val) + # else: + # cache_miss = cache_miss + 1 + + # print(f"(*)(*) cache hit rate = {round(cache_hit/(cache_hit+cache_miss)*100)}%. 
cache size = {round(getsize(cache)/1024/1024)}MB")
 
+        # if cache_value:
+        #     return cache_value
+        t = PerfTiming("llm.completion")
         result = litellm.completion(
             model=self.model,
             api_key=self.api_key,
@@ -158,7 +214,8 @@ def completion(self, messages: List[Dict[str, Any]], tools: Optional[List[Tool]]
             response_format=response_format,
             drop_params=drop_params
         )
-
+        t.end()
+        # cache[hash_val] = result
         if isinstance(result, ModelResponse):
             return result
         else:
diff --git a/holmes/core/perf_timing.py b/holmes/core/perf_timing.py
new file mode 100644
index 00000000..2d13db37
--- /dev/null
+++ b/holmes/core/perf_timing.py
@@ -0,0 +1,47 @@
+import time
+import logging
+
+from functools import wraps
+
+class PerfTiming:
+    def __init__(self, name):
+        self.ended = False
+
+        self.name = name
+        self.start_time = time.time()
+        self.last_measure_time = self.start_time
+        self.last_measure_label = "Start"
+        self.timings = []
+
+    def measure(self, label):
+        if self.ended:
+            raise Exception("cannot measure a perf timing that is already ended")
+        current_time = time.time()
+
+        time_since_start = int((current_time - self.start_time) * 1000)
+        time_since_last = int((current_time - self.last_measure_time) * 1000)
+
+        self.timings.append((label, time_since_last, time_since_start))
+
+        self.last_measure_time = current_time
+        self.last_measure_label = label
+
+    def end(self):
+        self.ended = True
+        current_time = time.time()
+        time_since_start = int((current_time - self.start_time) * 1000)
+        message = f'{self.name}(TOTAL) {time_since_start}ms'
+        logging.info(message)
+        for label, time_since_last, time_since_start in self.timings:
+            logging.info(f'  {self.name}({label}) +{time_since_last}ms {time_since_start}ms')
+
+def log_function_timing(func):
+    @wraps(func)
+    def function_timing_wrapper(*args, **kwargs):
+        start_time = time.perf_counter()
+        result = func(*args, **kwargs)
+        end_time = time.perf_counter()
+        total_time = int((end_time - start_time) * 1000)
+        logging.info(f'Function "{func.__name__}()" took {total_time}ms')
+        return result
+    return function_timing_wrapper
diff --git a/holmes/core/supabase_dal.py b/holmes/core/supabase_dal.py
index 0d48f2bd..819195c4 100644
--- a/holmes/core/supabase_dal.py
+++ b/holmes/core/supabase_dal.py
@@ -60,6 +60,7 @@ def __init__(self):
             return
         logging.info(f"Initializing robusta store for account {self.account_id}")
         options = ClientOptions(postgrest_client_timeout=SUPABASE_TIMEOUT_SECONDS)
+
         self.client = create_client(self.url, self.api_key, options)
         self.user_id = self.sign_in()
         ttl = int(os.environ.get("SAAS_SESSION_TOKEN_TTL_SEC", "82800"))  # 23 hours
@@ -244,7 +245,7 @@ def get_global_instructions_for_account(self) -> Optional[Instructions]:
                 self.client
                     .table(RUNBOOKS_TABLE)
                     .select("runbook")
-                    .eq("account_id", self.account_id) 
+                    .eq("account_id", self.account_id)
                     .eq("subject_type", "Account")
                     .execute()
             )
@@ -254,7 +255,7 @@ def get_global_instructions_for_account(self) -> Optional[Instructions]:
             return Instructions(instructions=instructions)
         except Exception:
             logging.exception("Failed to fetch global instructions", exc_info=True)
-        
+
         return None
 
     def create_session_token(self) -> str:
@@ -344,18 +345,18 @@ def upsert_holmes_status(self, holmes_status_data: dict) -> None:
                 .execute()
             )
         except Exception as error:
-            logging.error(f"Error happened during upserting holmes status: {error}", 
+            logging.error(f"Error happened during upserting holmes status: {error}",
                 exc_info=True)
 
         return None
-    
+
     def sync_toolsets(self, toolsets: list[dict], 
cluster_name: str) -> None: if not toolsets: logging.warning("No toolsets were provided for synchronization.") return - + provided_toolset_names = [toolset['toolset_name'] for toolset in toolsets] - + try: self.client.table(HOLMES_TOOLSET).upsert( toolsets, @@ -364,8 +365,8 @@ def sync_toolsets(self, toolsets: list[dict], cluster_name: str) -> None: logging.info("Toolsets upserted successfully.") - - self.client.table(HOLMES_TOOLSET).delete().eq("account_id", + + self.client.table(HOLMES_TOOLSET).delete().eq("account_id", self.account_id).eq( 'cluster_id', cluster_name).not_.in_( 'toolset_name', provided_toolset_names diff --git a/holmes/core/tool_calling_llm.py b/holmes/core/tool_calling_llm.py index 171e6535..693c49d1 100644 --- a/holmes/core/tool_calling_llm.py +++ b/holmes/core/tool_calling_llm.py @@ -20,6 +20,7 @@ from holmes.core.runbooks import RunbookManager from holmes.core.tools import ToolExecutor +from holmes.core.perf_timing import PerfTiming class ToolCallResult(BaseModel): tool_call_id: str @@ -102,11 +103,14 @@ def call( response_format: dict = None, user_prompt: Optional[str] = None, ) -> LLMResult: - + perf_timing = PerfTiming("tool_calling_llm.call") tool_calls = [] tools = self.tool_executor.get_all_tools_openai_format() + perf_timing.measure("tool_executor.get_all_tools_openai_format") + if len(tools) < 1: + tools = NOT_GIVEN for i in range(self.max_steps): - logging.debug(f"running iteration {i}") + logging.info(f"running iteration {i}") # on the last step we don't allow tools - we want to force a reply, not a request to run another tool tools = NOT_GIVEN if i == self.max_steps - 1 else tools tool_choice = NOT_GIVEN if tools == NOT_GIVEN else "auto" @@ -114,12 +118,13 @@ def call( total_tokens = self.llm.count_tokens_for_message(messages) max_context_size = self.llm.get_context_window_size() maximum_output_token = self.llm.get_maximum_output_token() - + perf_timing.measure("llm count tokens") if (total_tokens + maximum_output_token) > max_context_size: logging.warning("Token limit exceeded. Truncating tool responses.") messages = self.truncate_messages_to_fit_context( messages, max_context_size, maximum_output_token ) + perf_timing.measure("truncate_messages_to_fit_context") logging.debug(f"sending messages={messages}\n\ntools={tools}") try: @@ -131,6 +136,7 @@ def call( response_format=response_format, drop_params=True, ) + perf_timing.measure("llm.completion") logging.debug(f"got response {full_response.to_json()}") # catch a known error that occurs with Azure and replace the error message with something more obvious to the user except BadRequestError as e: @@ -165,6 +171,8 @@ def call( user_prompt=post_process_prompt, ) + perf_timing.measure("_post_processing_call") + perf_timing.end() return LLMResult( result=post_processed_response, unprocessed_result=raw_response, @@ -172,7 +180,7 @@ def call( prompt=json.dumps(messages, indent=2), messages=messages, ) - + perf_timing.end() return LLMResult( result=response_message.content, tool_calls=tool_calls, @@ -196,10 +204,14 @@ def call( "content": tool_call_result.result, } ) + perf_timing.measure(f"tool_call({tool_call_result.tool_name}/{tool_call_result.tool_call_id})") + + perf_timing.measure(f"iteration {i}") def _invoke_tool( self, tool_to_call: ChatCompletionMessageToolCall ) -> ToolCallResult: + t = PerfTiming("tool_calling_llm.invoke_tool") tool_name = tool_to_call.function.name tool_params = None try: @@ -208,14 +220,16 @@ def _invoke_tool( logging.warning( f"Failed to parse arguments for tool: {tool_name}. 
args: {tool_to_call.function.arguments}" ) - + t.measure("json.loads") tool_call_id = tool_to_call.id tool = self.tool_executor.get_tool_by_name(tool_name) + t.measure("tool_executor.get_tool_by_name") if (not tool) or (tool_params is None): logging.warning( f"Skipping tool execution for {tool_name}: args: {tool_to_call.function.arguments}" ) + t.end() return ToolCallResult( tool_call_id=tool_call_id, tool_name=tool_name, @@ -224,7 +238,8 @@ def _invoke_tool( ) tool_response = tool.invoke(tool_params) - + t.measure("tool.invoke") + t.end() return ToolCallResult( tool_call_id=tool_call_id, tool_name=tool_name, @@ -331,26 +346,27 @@ def investigate( self, issue: Issue, prompt: str, - console: Console, instructions: Optional[ResourceInstructions], + console: Optional[Console] = None, global_instructions: Optional[Instructions] = None, post_processing_prompt: Optional[str] = None, ) -> LLMResult: + t = PerfTiming("tool_Calling_llm.investigate") runbooks = self.runbook_manager.get_instructions_for_issue(issue) - + t.measure("runbook_manager.get_instructions_for_issue") if instructions != None and instructions.instructions: runbooks.extend(instructions.instructions) - if runbooks: + if console and runbooks: console.print( f"[bold]Analyzing with {len(runbooks)} runbooks: {runbooks}[/bold]" ) - else: + elif console: console.print( - f"[bold]No runbooks found for this issue. Using default behaviour. (Add runbooks to guide the investigation.)[/bold]" + "[bold]No runbooks found for this issue. Using default behaviour. (Add runbooks to guide the investigation.)[/bold]" ) system_prompt = load_and_render_prompt(prompt, {"issue": issue}) - + t.measure("load_and_render_prompt") if instructions != None and len(instructions.documents) > 0: docPrompts = [] for document in instructions.documents: @@ -359,16 +375,17 @@ def investigate( ) runbooks.extend(docPrompts) + t.measure("extend runbooks") user_prompt = "" if runbooks: for runbook_str in runbooks: user_prompt += f"* {runbook_str}\n" user_prompt = f'My instructions to check \n"""{user_prompt}"""' - + if global_instructions and global_instructions.instructions and len(global_instructions.instructions[0]) > 0: user_prompt += f"\n\nGlobal Instructions (use only if relevant): {global_instructions.instructions[0]}\n" - + user_prompt = f"{user_prompt}\n This is context from the issue {issue.raw}" logging.debug( @@ -376,6 +393,9 @@ def investigate( ) logging.debug("Rendered user prompt:\n%s", textwrap.indent(user_prompt, " ")) + t.measure("built prompts") res = self.prompt_call(system_prompt, user_prompt, post_processing_prompt) + t.measure("prompt_call") res.instructions = runbooks + t.end() return res diff --git a/holmes/plugins/toolsets/__init__.py b/holmes/plugins/toolsets/__init__.py index dca987cb..d4842e45 100644 --- a/holmes/plugins/toolsets/__init__.py +++ b/holmes/plugins/toolsets/__init__.py @@ -14,6 +14,7 @@ from pydantic import BaseModel from typing import Optional import yaml +from holmes.core.perf_timing import PerfTiming THIS_DIR = os.path.abspath(os.path.dirname(__file__)) @@ -46,6 +47,7 @@ def load_python_toolsets(dal:Optional[SupabaseDal], grafana_config:GrafanaConfig def load_builtin_toolsets(dal:Optional[SupabaseDal] = None, grafana_config:GrafanaConfig = GrafanaConfig()) -> List[Toolset]: + t = PerfTiming("load_builtin_toolsets") all_toolsets = [] logging.debug(f"loading toolsets from {THIS_DIR}") for filename in os.listdir(THIS_DIR): @@ -53,6 +55,11 @@ def load_builtin_toolsets(dal:Optional[SupabaseDal] = None, grafana_config:Grafa 
continue path = os.path.join(THIS_DIR, filename) all_toolsets.extend(load_toolsets_from_file(path)) + t.measure(f"load_toolsets_from_file:{filename}") + t.measure("all:load_toolsets_from_file") + python_toolsets = load_python_toolsets(dal, grafana_config) - all_toolsets.extend(load_python_toolsets(dal, grafana_config)) + t.measure("load_python_toolsets") + all_toolsets.extend(python_toolsets) + t.end() return all_toolsets diff --git a/holmes/plugins/toolsets/findings.py b/holmes/plugins/toolsets/findings.py index 4d290edb..d5134513 100644 --- a/holmes/plugins/toolsets/findings.py +++ b/holmes/plugins/toolsets/findings.py @@ -4,7 +4,7 @@ from typing import Optional from typing_extensions import Dict from holmes.core.supabase_dal import SupabaseDal -from holmes.core.tools import StaticPrerequisite, Tool, ToolParameter, Toolset +from holmes.core.tools import StaticPrerequisite, Tool, ToolParameter, Toolset, ToolsetTag PARAM_FINDING_ID = "id" @@ -72,5 +72,5 @@ def __init__(self, dal: Optional[SupabaseDal]): name="robusta", prerequisites=[dal_prereq], tools=[FetchRobustaFinding(dal)], - tags=["core",] + tags=[ToolsetTag.CORE,] ) diff --git a/holmes/plugins/toolsets/grafana_loki.py b/holmes/plugins/toolsets/grafana_loki.py index fb5d9644..8a7e76d2 100644 --- a/holmes/plugins/toolsets/grafana_loki.py +++ b/holmes/plugins/toolsets/grafana_loki.py @@ -180,4 +180,3 @@ def __init__(self, config: GrafanaLokiConfig): ], tags = [ToolsetTag.CORE, ] ) - self.check_prerequisites() diff --git a/holmes/plugins/toolsets/internet.py b/holmes/plugins/toolsets/internet.py index 2eb659e1..d3bf5a7b 100644 --- a/holmes/plugins/toolsets/internet.py +++ b/holmes/plugins/toolsets/internet.py @@ -2,7 +2,7 @@ import logging from typing import Any -from holmes.core.tools import Tool, ToolParameter, Toolset, ToolsetCommandPrerequisite +from holmes.core.tools import Tool, ToolParameter, Toolset, ToolsetCommandPrerequisite, ToolsetTag from markdownify import markdownify from playwright.sync_api import Error as PlaywrightError from playwright.sync_api import TimeoutError as PlaywrightTimeoutError @@ -167,5 +167,5 @@ def __init__(self): ), ], tools=[FetchWebpage()], - tags=["core",] + tags=[ToolsetTag.CORE,] ) diff --git a/holmes/utils/global_instructions.py b/holmes/utils/global_instructions.py index 479e52be..226a060b 100644 --- a/holmes/utils/global_instructions.py +++ b/holmes/utils/global_instructions.py @@ -1,7 +1,8 @@ +from typing import Optional from holmes.core.tool_calling_llm import Instructions -def add_global_instructions_to_user_prompt(user_prompt: str, global_instructions: Instructions) -> None: +def add_global_instructions_to_user_prompt(user_prompt: str, global_instructions: Optional[Instructions]) -> str: if global_instructions and global_instructions.instructions and len(global_instructions.instructions[0]) > 0: user_prompt += f"\n\nGlobal Instructions (use only if relevant): {global_instructions.instructions[0]}\n" - return user_prompt \ No newline at end of file + return user_prompt diff --git a/holmes/utils/holmes_sync_toolsets.py b/holmes/utils/holmes_sync_toolsets.py index 591cc7d3..d3b91170 100644 --- a/holmes/utils/holmes_sync_toolsets.py +++ b/holmes/utils/holmes_sync_toolsets.py @@ -8,18 +8,18 @@ from holmes.plugins.prompts import load_and_render_prompt from holmes.utils.definitions import CUSTOM_TOOLSET_LOCATION import logging -from datetime import datetime +from datetime import datetime def load_custom_toolsets_config() -> list[ToolsetYamlFromConfig]: """ Loads toolsets config from 
/etc/holmes/config/custom_toolset.yaml with ToolsetYamlFromConfig class - that doesn't have strict validations. + that doesn't have strict validations. Example configuration: kubernetes/logs: enabled: false - + test/configurations: enabled: true icon_url: "example.com" @@ -55,13 +55,13 @@ def merge_and_override_bultin_toolsets_with_toolsets_config( default_toolsets_by_name: dict[str, YAMLToolset], ) -> dict[str, YAMLToolset]: """ - Merges and overrides default_toolsets_by_name with custom + Merges and overrides default_toolsets_by_name with custom config from /etc/holmes/config/custom_toolset.yaml """ toolsets_with_updated_statuses = { toolset.name: toolset for toolset in default_toolsets_by_name.values() } - + for toolset in toolsets_loaded_from_config: if toolset.name in toolsets_with_updated_statuses.keys(): toolsets_with_updated_statuses[toolset.name].override_with(toolset) @@ -73,7 +73,7 @@ def merge_and_override_bultin_toolsets_with_toolsets_config( logging.error( f"Toolset '{toolset.name}' is invalid: {error} ", exc_info=True ) - + return toolsets_with_updated_statuses @@ -99,15 +99,14 @@ def holmes_sync_toolsets_status(dal: SupabaseDal, config) -> None: # we check every toolset and save to local config toolsets which have passed the checks # before we try to upsert anything to db for toolset in toolsets_for_sync_by_name.values(): - if toolset.enabled: - toolset.check_prerequisites() - + toolset.check_prerequisites() + config.enabled_toolsets_names = [toolset.name for toolset in toolsets_for_sync_by_name.values() if toolset.get_status() == ToolsetStatusEnum.ENABLED] if not config.cluster_name: raise Exception("Cluster name is missing in the configuration. Please ensure 'CLUSTER_NAME' is defined in the environment variables, " "or verify that a cluster name is provided in the Robusta configuration file.") - + db_toolsets = [] updated_at = datetime.now().isoformat() for toolset in toolsets_for_sync_by_name.values(): diff --git a/poetry.lock b/poetry.lock index a28d9046..79f98040 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1996,6 +1996,36 @@ files = [ {file = "protobuf-5.29.2.tar.gz", hash = "sha256:b2cc8e8bb7c9326996f0e160137b0861f1a82162502658df2951209d0cb0309e"}, ] +[[package]] +name = "psutil" +version = "6.1.1" +description = "Cross-platform lib for process and system monitoring in Python." 
+optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +files = [ + {file = "psutil-6.1.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:9ccc4316f24409159897799b83004cb1e24f9819b0dcf9c0b68bdcb6cefee6a8"}, + {file = "psutil-6.1.1-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:ca9609c77ea3b8481ab005da74ed894035936223422dc591d6772b147421f777"}, + {file = "psutil-6.1.1-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:8df0178ba8a9e5bc84fed9cfa61d54601b371fbec5c8eebad27575f1e105c0d4"}, + {file = "psutil-6.1.1-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:1924e659d6c19c647e763e78670a05dbb7feaf44a0e9c94bf9e14dfc6ba50468"}, + {file = "psutil-6.1.1-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:018aeae2af92d943fdf1da6b58665124897cfc94faa2ca92098838f83e1b1bca"}, + {file = "psutil-6.1.1-cp27-none-win32.whl", hash = "sha256:6d4281f5bbca041e2292be3380ec56a9413b790579b8e593b1784499d0005dac"}, + {file = "psutil-6.1.1-cp27-none-win_amd64.whl", hash = "sha256:c777eb75bb33c47377c9af68f30e9f11bc78e0f07fbf907be4a5d70b2fe5f030"}, + {file = "psutil-6.1.1-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:fc0ed7fe2231a444fc219b9c42d0376e0a9a1a72f16c5cfa0f68d19f1a0663e8"}, + {file = "psutil-6.1.1-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:0bdd4eab935276290ad3cb718e9809412895ca6b5b334f5a9111ee6d9aff9377"}, + {file = "psutil-6.1.1-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b6e06c20c05fe95a3d7302d74e7097756d4ba1247975ad6905441ae1b5b66003"}, + {file = "psutil-6.1.1-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:97f7cb9921fbec4904f522d972f0c0e1f4fabbdd4e0287813b21215074a0f160"}, + {file = "psutil-6.1.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33431e84fee02bc84ea36d9e2c4a6d395d479c9dd9bba2376c1f6ee8f3a4e0b3"}, + {file = "psutil-6.1.1-cp36-cp36m-win32.whl", hash = "sha256:384636b1a64b47814437d1173be1427a7c83681b17a450bfc309a1953e329603"}, + {file = "psutil-6.1.1-cp36-cp36m-win_amd64.whl", hash = "sha256:8be07491f6ebe1a693f17d4f11e69d0dc1811fa082736500f649f79df7735303"}, + {file = "psutil-6.1.1-cp37-abi3-win32.whl", hash = "sha256:eaa912e0b11848c4d9279a93d7e2783df352b082f40111e078388701fd479e53"}, + {file = "psutil-6.1.1-cp37-abi3-win_amd64.whl", hash = "sha256:f35cfccb065fff93529d2afb4a2e89e363fe63ca1e4a5da22b603a85833c2649"}, + {file = "psutil-6.1.1.tar.gz", hash = "sha256:cf8496728c18f2d0b45198f06895be52f36611711746b7f30c464b422b50e2f5"}, +] + +[package.extras] +dev = ["abi3audit", "black", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest-cov", "requests", "rstcheck", "ruff", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "vulture", "wheel"] +test = ["pytest", "pytest-xdist", "setuptools"] + [[package]] name = "pyasn1" version = "0.6.1" @@ -3518,4 +3548,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "d07c72494d8398a2e894a09b61258bfa6f51e191f0f445219e173c09237d68c6" +content-hash = "9397322146d6b7a087b1e7d0eb0ae7548c68cfd4cbb11f248e3a16b6c5e173d6" diff --git a/pyproject.toml b/pyproject.toml index 25913a99..26fabf47 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ bs4 = "^0.0.2" markdownify = "^0.13.1" starlette = "^0.40" google-api-python-client = "^2.156.0" +psutil = "^6.1.1" [tool.poetry.group.dev.dependencies] pytest = "^8.3.3" diff --git a/server.py 
b/server.py index b1535d02..ddae3202 100644 --- a/server.py +++ b/server.py @@ -1,4 +1,5 @@ import os +import uuid from holmes.utils.cert_utils import add_custom_certificate ADDITIONAL_CERTIFICATE: str = os.environ.get("CERTIFICATE", "") @@ -14,10 +15,10 @@ import logging import uvicorn import colorlog +import time from litellm.exceptions import AuthenticationError -from fastapi import FastAPI, HTTPException -from rich.console import Console +from fastapi import FastAPI, HTTPException, Request from holmes.utils.robusta import load_robusta_api_key from holmes.common.env_vars import ( @@ -32,6 +33,7 @@ build_issue_chat_messages, handle_issue_conversation, ) +from holmes.core.perf_timing import PerfTiming from holmes.core.issue import Issue from holmes.core.models import ( InvestigationResult, @@ -46,7 +48,12 @@ from holmes.plugins.prompts import load_and_render_prompt from holmes.utils.holmes_sync_toolsets import holmes_sync_toolsets_status from holmes.utils.global_instructions import add_global_instructions_to_user_prompt +import string +import random +import tracemalloc +import psutil +tracemalloc.start() def init_logging(): logging_level = os.environ.get("LOG_LEVEL", "INFO") @@ -85,26 +92,71 @@ async def lifespan(app: FastAPI): app = FastAPI(lifespan=lifespan) -console = Console() +@app.middleware("http") +async def log_requests(request: Request, call_next): + request_id = str(uuid.uuid4()) + start_time = time.time() + logging.info( + f"Request received - ID: {request_id} - " + f"Method: {request.method} - " + f"URL: {request.url}" + ) + + response = await call_next(request) + + process_time = (time.time() - start_time) * 1000 + logging.info( + f"Request completed - ID: {request_id} - " + f"Status: {response.status_code} - " + f"Process Time: {process_time:.2f}ms" + ) + + return response + +base_investigate_snapshot = None @app.post("/api/investigate") def investigate_issues(investigate_request: InvestigateRequest): + t = PerfTiming("/api/investigate") + # print(f"POST /api/investigate {json.dumps(investigate_request)}") + global base_investigate_snapshot + try: result = investigation.investigate_issues( investigate_request=investigate_request, dal=dal, - config=config, - console=console + config=config ) + t.end() return result except AuthenticationError as e: raise HTTPException(status_code=401, detail=e.message) - - + finally: + log_memory_diff(base_investigate_snapshot) + +def log_memory_diff(base_snapshot): + if not base_snapshot: + return + id = ''.join(random.choices(string.ascii_uppercase + string.digits, k=4)) + snapshot = tracemalloc.take_snapshot() + top_stats = snapshot.compare_to(base_snapshot, 'lineno') + + print(f"(*)(*)(*)(*) {id}") + for stat in top_stats[:10]: + print(f"(*)(*)(*)(*) ({id}) {stat}") + print(f"(*)(*)(*)(*) {id} END") + print(f"(*)(*)(*) MEM={psutil.Process().memory_info().rss / 1024 ** 2}MB") + +base_workload_health_check_snapshot = None @app.post("/api/workload_health_check") def workload_health_check(request: WorkloadHealthRequest): + global base_workload_health_check_snapshot + + if not base_workload_health_check_snapshot: + base_workload_health_check_snapshot = tracemalloc.take_snapshot() + load_robusta_api_key(dal=dal, config=config) try: resource = request.resource @@ -121,7 +173,7 @@ def workload_health_check(request: WorkloadHealthRequest): ) if stored_instructions: instructions.extend(stored_instructions.instructions) - + nl = "\n" if instructions: request.ask = f"{request.ask}\n My instructions for the investigation '''{nl.join(instructions)}'''" 
@@ -130,9 +182,9 @@ def workload_health_check(request: WorkloadHealthRequest): request.ask = add_global_instructions_to_user_prompt(request.ask, global_instructions) system_prompt = load_and_render_prompt(request.prompt_template, context={'alerts': workload_alerts}) - - ai = config.create_toolcalling_llm(console, dal=dal) + + ai = config.create_toolcalling_llm(dal=dal) structured_output = {"type": "json_object"} ai_call = ai.prompt_call( @@ -146,14 +198,15 @@ def workload_health_check(request: WorkloadHealthRequest): ) except AuthenticationError as e: raise HTTPException(status_code=401, detail=e.message) - + finally: + log_memory_diff(base_workload_health_check_snapshot) # older api that does not support conversation history @app.post("/api/conversation") -def issue_conversation(conversation_request: ConversationRequest): +def issue_conversation_deprecated(conversation_request: ConversationRequest): try: load_robusta_api_key(dal=dal, config=config) - ai = config.create_toolcalling_llm(console, dal=dal) + ai = config.create_toolcalling_llm(dal=dal) system_prompt = handle_issue_conversation(conversation_request, ai) @@ -171,7 +224,7 @@ def issue_conversation(conversation_request: ConversationRequest): def issue_conversation(issue_chat_request: IssueChatRequest): try: load_robusta_api_key(dal=dal, config=config) - ai = config.create_toolcalling_llm(console, dal=dal) + ai = config.create_toolcalling_llm(dal=dal) global_instructions = dal.get_global_instructions_for_account() messages = build_issue_chat_messages(issue_chat_request, ai, global_instructions) @@ -186,19 +239,32 @@ def issue_conversation(issue_chat_request: IssueChatRequest): raise HTTPException(status_code=401, detail=e.message) +base_chat_snapshot = None @app.post("/api/chat") def chat(chat_request: ChatRequest): + t = PerfTiming("/api/chat") + global base_chat_snapshot + + if not base_chat_snapshot: + base_chat_snapshot = tracemalloc.take_snapshot() + t.measure("tracemalloc.take_snapshot") try: load_robusta_api_key(dal=dal, config=config) + t.measure("load_robusta_api_key") + ai = config.create_toolcalling_llm(dal=dal) - ai = config.create_toolcalling_llm(console, dal=dal) + t.measure("config.create_toolcalling_llm") global_instructions = dal.get_global_instructions_for_account() + t.measure("dal.get_global_instructions_for_account") messages = build_chat_messages( chat_request.ask, chat_request.conversation_history, ai=ai, global_instructions=global_instructions ) + t.measure("build_chat_messages") llm_call = ai.messages_call(messages=messages) + t.measure("ai.messages_call") + t.end() return ChatResponse( analysis=llm_call.result, tool_calls=llm_call.tool_calls, @@ -206,6 +272,8 @@ def chat(chat_request: ChatRequest): ) except AuthenticationError as e: raise HTTPException(status_code=401, detail=e.message) + finally: + log_memory_diff(base_chat_snapshot) @app.get("/api/model") @@ -214,4 +282,8 @@ def get_model(): if __name__ == "__main__": - uvicorn.run(app, host=HOLMES_HOST, port=HOLMES_PORT) + log_config = uvicorn.config.LOGGING_CONFIG + #log_config["formatters"]["access"]["fmt"] = "%(asctime)s - %(levelname)s - %(message)s" + log_config["formatters"]["access"]["fmt"] = "%(asctime)s %(levelname)-8s %(message)s" + log_config["formatters"]["default"]["fmt"] = "%(asctime)s %(levelname)-8s %(message)s" + uvicorn.run(app, host=HOLMES_HOST, port=HOLMES_PORT, log_config=log_config) diff --git a/test-api_chat.sh b/test-api_chat.sh new file mode 100755 index 00000000..96856717 --- /dev/null +++ b/test-api_chat.sh @@ -0,0 +1,2 @@ 
+curl -XPOST 127.0.0.1:8000/api/chat -H "Content-Type: application/json" --data '{"ask": "and how many of these are unhealthy?", "conversation_history": [{"role": "system", "content": "You are a tool-calling AI assist provided with common devops and IT tools that you can use to troubleshoot problems or answer questions.\nWhenever possible you MUST first use tools to investigate then answer the question.\nDo not say - based on the tool output - or explicitly refer to tools at all.\nIf you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.\nIf you have a good and concrete suggestion for how the user can fix something, tell them even if not asked explicitly\n\nUse conversation history to maintain continuity when appropriate, ensuring efficiency in your responses.\n\n\nIn general:\n* when it can provide extra information, first run as many tools as you need to gather more information, then respond.\n* if possible, do so repeatedly with different tool calls each time to gather more information.\n* do not stop investigating until you are at the final root cause you are able to find.\n* use the \"five whys\" methodology to find the root cause.\n* for example, if you found a problem in microservice A that is due to an error in microservice B, look at microservice B too and find the error in that.\n* if you cannot find the resource/application that the user referred to, assume they made a typo or included/excluded characters like - and.\n* in this case, try to find substrings or search for the correct spellings\n* always provide detailed information like exact resource names, versions, labels, etc\n* even if you found the root cause, keep investigating to find other possible root causes and to gather data for the answer like exact names\n* when giving an answer don t say root cause but \"possible root causes\" and be clear to distinguish between what you know for certain and what is a possible explanation\n* if a runbook url is present as well as tool that can fetch it, you MUST fetch the runbook before beginning your investigation.\n* if you don t know, say that the analysis was inconclusive.\n* if there are multiple possible causes list them in a numbered list.\n* there will often be errors in the data that are not relevant or that do not have an impact - ignore them in your conclusion if you were not able to tie them to an actual error.\n\nIf investigating Kubernetes problems:\n* run as many kubectl commands as you need to gather more information, then respond.\n* if possible, do so repeatedly on different Kubernetes objects.\n* for example, for deployments first run kubectl on the deployment then a replicaset inside it, then a pod inside that.\n* if the user wants to find a specific term in a pod s logs, use kubectl_logs_grep\n* use both kubectl_previous_logs and kubectl_logs when reading logs. Treat the output of both as a single unified logs stream\n* when investigating a pod that crashed or application errors, always run kubectl_describe and fetch the logs\n* do not give an answer like \"The pod is pending\" as that doesn t state why the pod is pending and how to fix it.\n* do not give an answer like \"Pod s node affinity/selector doesn t match any available nodes\" because that doesn t include data on WHICH label doesn t match\n* if investigating an issue on many pods, there is no need to check more than 3 individual pods in the same deployment. 
pick up to a representative 3 from each deployment if relevant\n* if the user says something isn t working, ALWAYS:\n** use kubectl_describe on the owner workload + individual pods and look for any transient issues they might have been referring to\n** check the application aspects through the logs (kubectl_logs and kubectl_previous_logs) and other relevant tools\n** look for misconfigured ingresses/services etc\n\nHandling Permission Errors\nIf during the investigation you encounter a permissions error (e.g., `Error from server (Forbidden):`), **ALWAYS** follow these steps to ensure a thorough resolution:\n1.**Analyze the Error Message**\n - Identify the missing resource, API group, and verbs from the error details.\n - Never stop at reporting the error\n - Proceed with an in-depth investigation.\n2.**Locate the Relevant Helm Release**\nCheck if Helm tools are available, if they are available always use Helm commands to help user find the release associated with the Holmes pod:\n - Run `helm list -A | grep holmes` to identify the release name.\n - Run `helm get values -n ` to retrieve details such as `customClusterRoleRules` and `clusterName`.\nIf Helm tools are unavailable, skip this step.\n3. **Check for Missing Permissions**\n - Check for a cluster role with -holmes-cluster-role in its name and a service account with -holmes-service-account in its name to troubleshoot missing permissions where release name is the name you found earlier if helm tools are available (If the exact cluster role or service account isn t found, search for similar or related names, including variations or prefixes/suffixes that might be used in the cluster.)\n - Focus on identifying absent permissions that align with the error message.\n4. **Update the Configuration**\nIf necessary permissions are absent both in customClusterRoleRules and the cluster role mentioned previously, ALWAYS advise the user to update their configuration by modifying the `generated_values.yaml` file as follows: \n```\nholmes:\n customClusterRoleRules:\n - apiGroups: [\"\"]\n resources: [\"\", \"\"]\n verbs: [\"\", \"\", \"\"]\n```\nAfter that instruct them to apply the changes with::\n```\n helm upgrade robusta/robusta --values=generated_values.yaml --set clusterName=\n```\n5. **Fallback Guidelines**\n- If you cannot determine the release or cluster name, use placeholders `` and ``.\n- While you should attempt to retrieve details using Helm commands, do **not** direct the user to execute these commands themselves.\nReminder:\n* Always adhere to this process, even if Helm tools are unavailable.\n* Strive for thoroughness and precision, ensuring the issue is fully addressed.\n\nSpecial cases and how to reply:\n* if you are unable to investigate something properly because you do not have tools that let you access the right data, explicitly tell the user that you are missing an integration to access XYZ which you would need to investigate. you should give an answer similar to \"I don t have access to
. Please add a Holmes integration for so that I can investigate this.\"\n* make sure you differentiate between \"I investigated and found error X caused this problem\" and \"I tried to investigate but while investigating I got some errors that prevented me from completing the investigation.\"\n* as a special case of that, If a tool generates a permission error when attempting to run it, follow the Handling Permission Errors section for detailed guidance.\n* that is different than - for example - fetching a pod s logs and seeing that the pod itself has permission errors. in that case, you explain say that permission errors are the cause of the problem and give details\n* Issues are a subset of findings. When asked about an issue or a finding and you have an id, use the tool `fetch_finding_by_id`.\n* For any question, try to make the answer specific to the user s cluster.\n** For example, if asked to port forward, find out the app or pod port (kubectl decribe) and provide a port forward command specific to the user s question\n\n\nStyle guide:\n* Reply with terse output.\n* Be painfully concise.\n* Leave out \"the\" and filler words when possible.\n* Be terse but not at the expense of leaving out important data like the root cause and how to fix.\n\nExamples:\n\nUser: Why did the webserver-example app crash?\n(Call tool kubectl_find_resource kind=pod keyword=webserver`)\n(Call tool kubectl_previous_logs namespace=demos pod=webserver-example-1299492-d9g9d # this pod name was found from the previous tool call)\n\nAI: `webserver-example-1299492-d9g9d` crashed due to email validation error during HTTP request for /api/create_user\nRelevant logs:\n\n```\n2021-01-01T00:00:00.000Z [ERROR] Missing required field email in request body\n```\n\nValidation error led to unhandled Java exception causing a crash."}, {"role": "user", "content": "Hi"}, {"content": "Hello! 
How can I assist you today?", "role": "assistant"}, {"role": "user", "content": "How many pods are running?"}, {"role": "assistant", "tool_calls": [{"function": {"arguments": "{\"kind\":\"pod\"}", "name": "kubectl_get_by_kind_in_cluster"}, "id": "call_ZMgO2X6GrMVF16YbLW02lHZW", "type": "function"}]}, {"tool_call_id": "call_ZMgO2X6GrMVF16YbLW02lHZW", "role": "tool", "name": "kubectl_get_by_kind_in_cluster", "content": "stdout:\nNAMESPACE NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES LABELS\nargocd argocd-application-controller-0 1/1 Running 9 (111m ago) 29d 10.244.0.12 grafana-cloud-control-plane app.kubernetes.io/name=argocd-application-controller,apps.kubernetes.io/pod-index=0,controller-revision-hash=argocd-application-controller-77d8549fd,statefulset.kubernetes.io/pod-name=argocd-application-controller-0\nargocd argocd-applicationset-controller-7ff94fc879-lfrcp 1/1 Running 9 (111m ago) 29d 10.244.0.5 grafana-cloud-control-plane app.kubernetes.io/name=argocd-applicationset-controller,pod-template-hash=7ff94fc879\nargocd argocd-dex-server-84b879d87c-xpwvh 1/1 Running 9 (111m ago) 29d 10.244.0.17 grafana-cloud-control-plane app.kubernetes.io/name=argocd-dex-server,pod-template-hash=84b879d87c\nargocd argocd-notifications-controller-6c65b4b9f6-5z2nn 1/1 Running 9 (111m ago) 29d 10.244.0.7 grafana-cloud-control-plane app.kubernetes.io/name=argocd-notifications-controller,pod-template-hash=6c65b4b9f6\nargocd argocd-redis-868dbb7cf4-kwjg2 1/1 Running 9 (111m ago) 29d 10.244.0.6 grafana-cloud-control-plane app.kubernetes.io/name=argocd-redis,pod-template-hash=868dbb7cf4\nargocd argocd-repo-server-6d47848766-9l5kf 1/1 Running 9 (111m ago) 29d 10.244.0.16 grafana-cloud-control-plane app.kubernetes.io/name=argocd-repo-server,pod-template-hash=6d47848766\nargocd argocd-server-c9f58d8cf-5wz2j 1/1 Running 16 (111m ago) 29d 10.244.0.15 grafana-cloud-control-plane app.kubernetes.io/name=argocd-server,pod-template-hash=c9f58d8cf\ndefault alertmanager-robusta-kube-prometheus-st-alertmanager-0 2/2 Running 4 (111m ago) 2d17h 10.244.0.24 grafana-cloud-control-plane alertmanager=robusta-kube-prometheus-st-alertmanager,app.kubernetes.io/instance=robusta-kube-prometheus-st-alertmanager,app.kubernetes.io/managed-by=prometheus-operator,app.kubernetes.io/name=alertmanager,app.kubernetes.io/version=0.26.0,apps.kubernetes.io/pod-index=0,controller-revision-hash=alertmanager-robusta-kube-prometheus-st-alertmanager-57cd7fb46f,statefulset.kubernetes.io/pod-name=alertmanager-robusta-kube-prometheus-st-alertmanager-0\ndefault customer-orders-766b65899b-hfpss 2/2 Running 18 (111m ago) 30d 10.244.0.4 grafana-cloud-control-plane app=customer-orders,pod-template-hash=766b65899b\ndefault grafana-k8s-monitoring-alloy-0 2/2 Running 21 (110m ago) 30d 10.244.0.2 grafana-cloud-control-plane app.kubernetes.io/instance=grafana-k8s-monitoring,app.kubernetes.io/name=alloy,apps.kubernetes.io/pod-index=0,controller-revision-hash=grafana-k8s-monitoring-alloy-7cf58b875b,statefulset.kubernetes.io/pod-name=grafana-k8s-monitoring-alloy-0\ndefault grafana-k8s-monitoring-alloy-events-799cc88c88-9pllx 2/2 Running 22 (110m ago) 30d 10.244.0.3 grafana-cloud-control-plane app.kubernetes.io/instance=grafana-k8s-monitoring,app.kubernetes.io/name=alloy-events,pod-template-hash=799cc88c88\ndefault grafana-k8s-monitoring-alloy-logs-t4ktj 2/2 Running 22 (110m ago) 30d 10.244.0.13 grafana-cloud-control-plane 
app.kubernetes.io/instance=grafana-k8s-monitoring,app.kubernetes.io/name=alloy-logs,controller-revision-hash=fdd8b877b,pod-template-generation=1\ndefault grafana-k8s-monitoring-kepler-2hntq 1/1 Running 9 (111m ago) 30d 172.18.0.3 grafana-cloud-control-plane app.kubernetes.io/component=exporter,app.kubernetes.io/name=kepler,controller-revision-hash=7b459fdb69,pod-template-generation=1\ndefault grafana-k8s-monitoring-kube-state-metrics-5ff5b4947b-b7l57 1/1 Running 20 (110m ago) 30d 10.244.0.8 grafana-cloud-control-plane app.kubernetes.io/component=metrics,app.kubernetes.io/instance=grafana-k8s-monitoring,app.kubernetes.io/managed-by=Helm,app.kubernetes.io/name=kube-state-metrics,app.kubernetes.io/part-of=kube-state-metrics,app.kubernetes.io/version=2.14.0,helm.sh/chart=kube-state-metrics-5.27.0,pod-template-hash=5ff5b4947b,release=grafana-k8s-monitoring\ndefault grafana-k8s-monitoring-opencost-5b67d55db5-nwp7l 1/1 Running 9 (111m ago) 30d 10.244.0.14 grafana-cloud-control-plane app.kubernetes.io/instance=grafana-k8s-monitoring,app.kubernetes.io/name=opencost,pod-template-hash=5b67d55db5\ndefault grafana-k8s-monitoring-prometheus-node-exporter-wjb7d 1/1 Running 9 (111m ago) 30d 172.18.0.3 grafana-cloud-control-plane app.kubernetes.io/component=metrics,app.kubernetes.io/instance=grafana-k8s-monitoring,app.kubernetes.io/managed-by=Helm,app.kubernetes.io/name=prometheus-node-exporter,app.kubernetes.io/part-of=prometheus-node-exporter,app.kubernetes.io/version=1.8.2,controller-revision-hash=869855d879,helm.sh/chart=prometheus-node-exporter-4.42.0,pod-template-generation=1,release=grafana-k8s-monitoring\ndefault kafka-client 0/1 Unknown 0 28d grafana-cloud-control-plane run=kafka-client\ndefault kafka-consumer 0/1 Unknown 0 28d grafana-cloud-control-plane run=kafka-consumer\ndefault kafka-controller-0 1/1 Running 18 (111m ago) 28d 10.244.0.26 grafana-cloud-control-plane app.kubernetes.io/component=controller-eligible,app.kubernetes.io/instance=kafka,app.kubernetes.io/managed-by=Helm,app.kubernetes.io/name=kafka,app.kubernetes.io/part-of=kafka,app.kubernetes.io/version=3.9.0,apps.kubernetes.io/pod-index=0,controller-revision-hash=kafka-controller-58645b745b,helm.sh/chart=kafka-31.1.0,statefulset.kubernetes.io/pod-name=kafka-controller-0\ndefault kafka-controller-1 1/1 Running 16 (111m ago) 28d 10.244.0.23 grafana-cloud-control-plane app.kubernetes.io/component=controller-eligible,app.kubernetes.io/instance=kafka,app.kubernetes.io/managed-by=Helm,app.kubernetes.io/name=kafka,app.kubernetes.io/part-of=kafka,app.kubernetes.io/version=3.9.0,apps.kubernetes.io/pod-index=1,controller-revision-hash=kafka-controller-58645b745b,helm.sh/chart=kafka-31.1.0,statefulset.kubernetes.io/pod-name=kafka-controller-1\ndefault kafka-controller-2 1/1 Running 19 (111m ago) 28d 10.244.0.28 grafana-cloud-control-plane app.kubernetes.io/component=controller-eligible,app.kubernetes.io/instance=kafka,app.kubernetes.io/managed-by=Helm,app.kubernetes.io/name=kafka,app.kubernetes.io/part-of=kafka,app.kubernetes.io/version=3.9.0,apps.kubernetes.io/pod-index=2,controller-revision-hash=kafka-controller-58645b745b,helm.sh/chart=kafka-31.1.0,statefulset.kubernetes.io/pod-name=kafka-controller-2\ndefault kafka-producer 0/1 Unknown 0 28d grafana-cloud-control-plane run=kafka-producer\ndefault prometheus-robusta-kube-prometheus-st-prometheus-0 2/2 Running 4 (111m ago) 2d17h 10.244.0.27 grafana-cloud-control-plane 
app.kubernetes.io/instance=robusta-kube-prometheus-st-prometheus,app.kubernetes.io/managed-by=prometheus-operator,app.kubernetes.io/name=prometheus,app.kubernetes.io/version=2.48.1,apps.kubernetes.io/pod-index=0,controller-revision-hash=prometheus-robusta-kube-prometheus-st-prometheus-55d87c869b,operator.prometheus.io/name=robusta-kube-prometheus-st-prometheus,operator.prometheus.io/shard=0,prometheus=robusta-kube-prometheus-st-prometheus,statefulset.kubernetes.io/pod-name=prometheus-robusta-kube-prometheus-st-prometheus-0\ndefault robusta-forwarder-5c5fdbbf57-z6sxp 1/1 Running 2 (111m ago) 2d17h 10.244.0.21 grafana-cloud-control-plane app=robusta-forwarder,pod-template-hash=5c5fdbbf57\ndefault robusta-grafana-8588b8fb85-dw8rn 3/3 Running 6 (111m ago) 2d17h 10.244.0.20 grafana-cloud-control-plane app.kubernetes.io/instance=robusta,app.kubernetes.io/name=grafana,pod-template-hash=8588b8fb85\ndefault robusta-holmes-775d7ddcb9-f4jxg 1/1 Running 0 50s 10.244.0.49 grafana-cloud-control-plane app=holmes,pod-template-hash=775d7ddcb9\ndefault robusta-kube-prometheus-st-operator-6885c8f675-7wqpd 1/1 Running 5 (110m ago) 2d17h 10.244.0.18 grafana-cloud-control-plane app.kubernetes.io/instance=robusta,app.kubernetes.io/managed-by=Helm,app.kubernetes.io/part-of=kube-prometheus-stack,app.kubernetes.io/version=55.7.0,app=kube-prometheus-stack-operator,chart=kube-prometheus-stack-55.7.0,heritage=Helm,pod-template-hash=6885c8f675,release=robusta\ndefault robusta-kube-state-metrics-8667fd9775-7dnqk 1/1 Running 6 (110m ago) 2d17h 10.244.0.19 grafana-cloud-control-plane app.kubernetes.io/component=metrics,app.kubernetes.io/instance=robusta,app.kubernetes.io/managed-by=Helm,app.kubernetes.io/name=kube-state-metrics,app.kubernetes.io/part-of=kube-state-metrics,app.kubernetes.io/version=2.10.1,helm.sh/chart=kube-state-metrics-5.15.3,pod-template-hash=8667fd9775,release=robusta\ndefault robusta-prometheus-node-exporter-9dczn 1/1 Running 2 (111m ago) 2d17h 172.18.0.3 grafana-cloud-control-plane app.kubernetes.io/component=metrics,app.kubernetes.io/instance=robusta,app.kubernetes.io/managed-by=Helm,app.kubernetes.io/name=prometheus-node-exporter,app.kubernetes.io/part-of=prometheus-node-exporter,app.kubernetes.io/version=1.7.0,controller-revision-hash=7b4f94f6bf,helm.sh/chart=prometheus-node-exporter-4.24.0,jobLabel=node-exporter,pod-template-generation=1,release=robusta\ndefault robusta-runner-64c4d84845-p725f 1/1 Running 1 (111m ago) 43h 10.244.0.22 grafana-cloud-control-plane app=robusta-runner,pod-template-hash=64c4d84845,robustaComponent=runner\nkube-system coredns-7c65d6cfc9-h65ff 1/1 Running 9 (111m ago) 30d 10.244.0.10 grafana-cloud-control-plane k8s-app=kube-dns,pod-template-hash=7c65d6cfc9\nkube-system coredns-7c65d6cfc9-v6lvm 1/1 Running 9 (111m ago) 30d 10.244.0.11 grafana-cloud-control-plane k8s-app=kube-dns,pod-template-hash=7c65d6cfc9\nkube-system etcd-grafana-cloud-control-plane 1/1 Running 0 111m 172.18.0.3 grafana-cloud-control-plane component=etcd,tier=control-plane\nkube-system kindnet-x9k26 1/1 Running 9 (111m ago) 30d 172.18.0.3 grafana-cloud-control-plane app=kindnet,controller-revision-hash=85f8898dd9,k8s-app=kindnet,pod-template-generation=1,tier=node\nkube-system kube-apiserver-grafana-cloud-control-plane 1/1 Running 0 111m 172.18.0.3 grafana-cloud-control-plane component=kube-apiserver,tier=control-plane\nkube-system kube-controller-manager-grafana-cloud-control-plane 1/1 Running 10 (111m ago) 30d 172.18.0.3 grafana-cloud-control-plane 
component=kube-controller-manager,tier=control-plane\nkube-system kube-proxy-s6qsc 1/1 Running 13 (111m ago) 30d 172.18.0.3 grafana-cloud-control-plane controller-revision-hash=77987969cc,k8s-app=kube-proxy,pod-template-generation=1\nkube-system kube-scheduler-grafana-cloud-control-plane 1/1 Running 10 (111m ago) 30d 172.18.0.3 grafana-cloud-control-plane component=kube-scheduler,tier=control-plane\nlocal-path-storage local-path-provisioner-57c5987fd4-cr8dx 1/1 Running 18 (110m ago) 30d 10.244.0.9 grafana-cloud-control-plane app=local-path-provisioner,pod-template-hash=57c5987fd4\n\nstderr:\n"}, {"content": "There are 36 pods currently running in the cluster.", "role": "assistant"}]}' +" diff --git a/tests/llm/test_investigate.py b/tests/llm/test_investigate.py index 7f0a3703..8c8b4bca 100644 --- a/tests/llm/test_investigate.py +++ b/tests/llm/test_investigate.py @@ -33,7 +33,7 @@ def __init__(self, test_case:InvestigateTestCase): self._test_case = test_case def create_tool_executor( - self, console: Console, allowed_toolsets: ToolsetPattern, dal:Optional[SupabaseDal] + self, allowed_toolsets: ToolsetPattern, dal:Optional[SupabaseDal] ) -> ToolExecutor: mock = MockToolsets(generate_mocks=self._test_case.generate_mocks, test_case_folder=self._test_case.folder) diff --git a/tests/test_issue_investigator.py b/tests/test_issue_investigator.py index cefce210..a257b03e 100644 --- a/tests/test_issue_investigator.py +++ b/tests/test_issue_investigator.py @@ -29,9 +29,7 @@ def _test_investigate_issue_using_fetch_webpage(): ) console = Console() config = Config.load_from_env() - ai = config.create_issue_investigator( - console, allowed_toolsets='*' - ) + ai = config.create_issue_investigator() issue = Issue( id="", @@ -70,11 +68,8 @@ def _test_investigate_issue_without_fetch_webpage(): instructions=[], documents=[] ) - console = Console() config = Config.load_from_env() - ai = config.create_issue_investigator( - console, allowed_toolsets='*' - ) + ai = config.create_issue_investigator() issue = Issue( id="", @@ -87,7 +82,6 @@ def _test_investigate_issue_without_fetch_webpage(): investigation = ai.investigate( issue=issue, prompt=investigate_request.prompt_template, - console=console, post_processing_prompt=HOLMES_POST_PROCESSING_PROMPT, instructions=resource_instructions, )
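
-- 
Usage sketch for the new timing helpers (illustrative only: the helper names
match holmes/core/perf_timing.py above, while fetch_issue_data is a
hypothetical stand-in for any step worth timing):

    from holmes.core.perf_timing import PerfTiming, log_function_timing

    t = PerfTiming("investigate_issues")   # capture the start time
    data = fetch_issue_data()              # hypothetical workload
    t.measure("fetch_issue_data")          # record the delta since the last mark
    t.end()                                # log the total plus one line per mark

    @log_function_timing
    def slow_call():                       # logs: Function "slow_call()" took <N>ms
        ...

Note that measure() raises once end() has been called, so each request should
create its own PerfTiming instance rather than reusing a module-level one.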