From d456d7b82b090eb700205b85d80c26362a97a82b Mon Sep 17 00:00:00 2001 From: Dima Chievtaiev Date: Mon, 23 Dec 2024 13:43:45 +0100 Subject: [PATCH 1/5] Added permission error handling for general prompt --- Dockerfile | 14 ++++++++ holmes/core/conversations.py | 2 +- .../prompts/_general_instructions.jinja2 | 35 ++++++++++++++++++- .../prompts/generic_ask_conversation.jinja2 | 32 +++++++++++++++++ 4 files changed, 81 insertions(+), 2 deletions(-) create mode 100644 holmes/plugins/prompts/generic_ask_conversation.jinja2 diff --git a/Dockerfile b/Dockerfile index dc21b3e5..5f805af4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -43,6 +43,14 @@ RUN ./kube-lineage --version RUN curl -sSL -o argocd-linux-amd64 https://github.com/argoproj/argo-cd/releases/latest/download/argocd-linux-amd64 +# Install Helm +RUN curl https://baltocdn.com/helm/signing.asc | gpg --dearmor -o /usr/share/keyrings/helm.gpg \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/helm.gpg] https://baltocdn.com/helm/stable/debian/ all main" \ + | tee /etc/apt/sources.list.d/helm-stable-debian.list \ + && apt-get update \ + && apt-get install -y helm \ + && rm -rf /var/lib/apt/lists/* + # Set up poetry ARG PRIVATE_PACKAGE_REGISTRY="none" RUN if [ "${PRIVATE_PACKAGE_REGISTRY}" != "none" ]; then \ @@ -92,10 +100,16 @@ RUN apt-get install -y kubectl COPY --from=builder /app/kube-lineage /usr/local/bin RUN kube-lineage --version +# Set up ArgoCD COPY --from=builder /app/argocd-linux-amd64 /usr/local/bin/argocd RUN chmod 555 /usr/local/bin/argocd RUN argocd --help +# Set up Helm +COPY --from=builder /usr/bin/helm /usr/local/bin/helm +RUN chmod 555 /usr/local/bin/helm +RUN helm version + ARG AWS_DEFAULT_PROFILE ARG AWS_DEFAULT_REGION ARG AWS_PROFILE diff --git a/holmes/core/conversations.py b/holmes/core/conversations.py index f0ba6155..94dead8f 100644 --- a/holmes/core/conversations.py +++ b/holmes/core/conversations.py @@ -304,7 +304,7 @@ def 
build_issue_chat_messages(issue_chat_request: IssueChatRequest, ai: ToolCall def build_chat_messages( ask: str, conversation_history: Optional[List[Dict[str, str]]], ai: ToolCallingLLM ) -> List[dict]: - template_path = "builtin://generic_ask.jinja2" + template_path = "builtin://generic_ask_conversation.jinja2" if not conversation_history or len(conversation_history) == 0: system_prompt = load_and_render_prompt(template_path, {}) diff --git a/holmes/plugins/prompts/_general_instructions.jinja2 b/holmes/plugins/prompts/_general_instructions.jinja2 index 52b8952c..1cf61b7a 100644 --- a/holmes/plugins/prompts/_general_instructions.jinja2 +++ b/holmes/plugins/prompts/_general_instructions.jinja2 @@ -29,10 +29,43 @@ If investigating Kubernetes problems: ** check the application aspects through the logs (kubectl_logs and kubectl_previous_logs) and other relevant tools ** look for misconfigured ingresses/services etc +Handling Permission Errors +If during the investigation you encounter a permissions error (e.g., `Error from server (Forbidden):`), **ALWAYS** follow these steps to ensure a thorough resolution: +1. **Analyze the Error Message** + - Identify the missing resource, API group, and verbs from the error details. + - Never stop at reporting the error—proceed with an in-depth investigation. +2. **Locate the Relevant Helm Release** +Use Helm commands to find the release associated with the Holmes pod: + - Run `helm list -A | grep holmes` to identify the release name. + - Run `helm get values -n ` to retrieve details such as `customClusterRoleRules` and `clusterName`. + if Helm tools are unavailable, skip this step +3. **Check for Missing Permissions** +- Review the `holmes-cluster-role` cluster role and `holmes-service-account` service account for missing permissions. +- Focus on identifying absent permissions that align with the error message. +4. 
**Update the Configuration** + If necessary permissions are absent both in customClusterRoleRules and `holmes-cluster-role`, ALWAYS advise the user to update their configuration by modifying the `generated_values.yaml` file as follows: + ```yaml + holmes: + customClusterRoleRules: + - apiGroups: [""] + resources: ["", ""] + verbs: ["", "", ""] + ``` + After that instruct them to apply the changes with:: + ```bash + helm upgrade robusta/robusta --values=generated_values.yaml --set clusterName= + ``` +5. **Fallback Guidelines** +- If you cannot determine the release or cluster name, use placeholders `` and ``. +- While you should attempt to retrieve details using Helm commands, do **not** direct the user to execute these commands themselves. +Reminder: +* Always adhere to this process, even if Helm tools are unavailable. +* Strive for thoroughness and precision, ensuring the issue is fully addressed. + Special cases and how to reply: * if you are unable to investigate something properly because you do not have tools that let you access the right data, explicitly tell the user that you are missing an integration to access XYZ which you would need to investigate. you should give an answer similar to "I don't have access to
. Please add a Holmes integration for so that I can investigate this." * make sure you differentiate between "I investigated and found error X caused this problem" and "I tried to investigate but while investigating I got some errors that prevented me from completing the investigation." -* as a special case of that, if you try to investigate by running a tool and the tool gives you output that permissions are missing *to run the tool* then say "I tried to investigate but I am missing permissions to run the tool .
" +* as a special case of that, if a tool generates a permission error when attempting to run it, follow the Handling Permission Errors section for detailed guidance. * that is different than - for example - fetching a pod's logs and seeing that the pod itself has permission errors. in that case, you explain say that permission errors are the cause of the problem and give details * Issues are a subset of findings. When asked about an issue or a finding and you have an id, use the tool `fetch_finding_by_id`. * For any question, try to make the answer specific to the user's cluster. diff --git a/holmes/plugins/prompts/generic_ask_conversation.jinja2 b/holmes/plugins/prompts/generic_ask_conversation.jinja2 new file mode 100644 index 00000000..3ff45adb --- /dev/null +++ b/holmes/plugins/prompts/generic_ask_conversation.jinja2 @@ -0,0 +1,32 @@ +You are a tool-calling AI assistant provided with common devops and IT tools that you can use to troubleshoot problems or answer questions. +Whenever possible you MUST first use tools to investigate then answer the question. +Do not say 'based on the tool output' or explicitly refer to tools at all. +If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time. +If you have a good and concrete suggestion for how the user can fix something, tell them even if not asked explicitly. + +Use conversation history to maintain continuity when appropriate, ensuring efficiency in your responses. + + +{% include '_general_instructions.jinja2' %} + + +Style guide: +* Reply with terse output. +* Be painfully concise. +* Leave out "the" and filler words when possible. +* Be terse but not at the expense of leaving out important data like the root cause and how to fix. + +Examples: + +User: Why did the webserver-example app crash? 
+(Call tool kubectl_find_resource kind=pod keyword=webserver) +(Call tool kubectl_previous_logs namespace=demos pod=webserver-example-1299492-d9g9d # this pod name was found from the previous tool call) + +AI: `webserver-example-1299492-d9g9d` crashed due to email validation error during HTTP request for /api/create_user +Relevant logs: + +``` +2021-01-01T00:00:00.000Z [ERROR] Missing required field 'email' in request body +``` + +Validation error led to unhandled Java exception causing a crash. \ No newline at end of file From 2c077b645add3758294df7198a4e39caf20df763 Mon Sep 17 00:00:00 2001 From: Dima Chievtaiev Date: Mon, 23 Dec 2024 20:17:20 +0100 Subject: [PATCH 2/5] Aligned error handling in _general_instructions.jinja2 --- .../prompts/_general_instructions.jinja2 | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/holmes/plugins/prompts/_general_instructions.jinja2 b/holmes/plugins/prompts/_general_instructions.jinja2 index 1cf61b7a..78f0878c 100644 --- a/holmes/plugins/prompts/_general_instructions.jinja2 +++ b/holmes/plugins/prompts/_general_instructions.jinja2 @@ -31,30 +31,30 @@ If investigating Kubernetes problems: Handling Permission Errors If during the investigation you encounter a permissions error (e.g., `Error from server (Forbidden):`), **ALWAYS** follow these steps to ensure a thorough resolution: -1. **Analyze the Error Message** - - Identify the missing resource, API group, and verbs from the error details. - - Never stop at reporting the error—proceed with an in-depth investigation. -2. **Locate the Relevant Helm Release** +1.**Analyze the Error Message** + - Identify the missing resource, API group, and verbs from the error details. + - Never stop at reporting the error—proceed with an in-depth investigation. +2.**Locate the Relevant Helm Release** Use Helm commands to find the release associated with the Holmes pod: - - Run `helm list -A | grep holmes` to identify the release name. 
- - Run `helm get values -n ` to retrieve details such as `customClusterRoleRules` and `clusterName`. - if Helm tools are unavailable, skip this step + - Run `helm list -A | grep holmes` to identify the release name. + - Run `helm get values -n ` to retrieve details such as `customClusterRoleRules` and `clusterName`. +If Helm tools are unavailable, skip this step. 3. **Check for Missing Permissions** -- Review the `holmes-cluster-role` cluster role and `holmes-service-account` service account for missing permissions. -- Focus on identifying absent permissions that align with the error message. + - Review the `holmes-cluster-role` cluster role and `holmes-service-account` service account for missing permissions. + - Focus on identifying absent permissions that align with the error message. 4. **Update the Configuration** - If necessary permissions are absent both in customClusterRoleRules and `holmes-cluster-role`, ALWAYS advise the user to update their configuration by modifying the `generated_values.yaml` file as follows: - ```yaml - holmes: - customClusterRoleRules: - - apiGroups: [""] - resources: ["", ""] - verbs: ["", "", ""] - ``` - After that instruct them to apply the changes with:: - ```bash - helm upgrade robusta/robusta --values=generated_values.yaml --set clusterName= - ``` +If necessary permissions are absent both in customClusterRoleRules and `holmes-cluster-role`, ALWAYS advise the user to update their configuration by modifying the `generated_values.yaml` file as follows: +``` +holmes: + customClusterRoleRules: + - apiGroups: [""] + resources: ["", ""] + verbs: ["", "", ""] +``` +After that instruct them to apply the changes with:: +``` + helm upgrade robusta/robusta --values=generated_values.yaml --set clusterName= +``` 5. **Fallback Guidelines** - If you cannot determine the release or cluster name, use placeholders `` and ``. 
- While you should attempt to retrieve details using Helm commands, do **not** direct the user to execute these commands themselves. From 0edcfabbc4c640848a0ffd7acd0c177558aa5580 Mon Sep 17 00:00:00 2001 From: Dima Chievtaiev Date: Tue, 24 Dec 2024 12:17:44 +0100 Subject: [PATCH 3/5] Fixed Handling Permission Errors's Analyze the Error Message --- holmes/plugins/prompts/_general_instructions.jinja2 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/holmes/plugins/prompts/_general_instructions.jinja2 b/holmes/plugins/prompts/_general_instructions.jinja2 index 78f0878c..26859b13 100644 --- a/holmes/plugins/prompts/_general_instructions.jinja2 +++ b/holmes/plugins/prompts/_general_instructions.jinja2 @@ -33,7 +33,8 @@ Handling Permission Errors If during the investigation you encounter a permissions error (e.g., `Error from server (Forbidden):`), **ALWAYS** follow these steps to ensure a thorough resolution: 1.**Analyze the Error Message** - Identify the missing resource, API group, and verbs from the error details. - - Never stop at reporting the error—proceed with an in-depth investigation. + - Never stop at reporting the error + — Proceed with an in-depth investigation. 2.**Locate the Relevant Helm Release** Use Helm commands to find the release associated with the Holmes pod: - Run `helm list -A | grep holmes` to identify the release name. 
From f4f3ebc29b284119ccf8efd176cd669c6f452277 Mon Sep 17 00:00:00 2001 From: Dima Chievtaiev Date: Mon, 6 Jan 2025 15:10:01 +0100 Subject: [PATCH 4/5] Added evaluation test for no permission --- .../custom_resources.yaml | 32 +++++++++++++++++++ .../kubectl_find_resource.txt | 6 ++++ .../test_case.yaml | 8 +++++ 3 files changed, 46 insertions(+) create mode 100644 tests/llm/fixtures/test_ask_holmes/27_permissions_error_no_helm_tools/custom_resources.yaml create mode 100644 tests/llm/fixtures/test_ask_holmes/27_permissions_error_no_helm_tools/kubectl_find_resource.txt create mode 100644 tests/llm/fixtures/test_ask_holmes/27_permissions_error_no_helm_tools/test_case.yaml diff --git a/tests/llm/fixtures/test_ask_holmes/27_permissions_error_no_helm_tools/custom_resources.yaml b/tests/llm/fixtures/test_ask_holmes/27_permissions_error_no_helm_tools/custom_resources.yaml new file mode 100644 index 00000000..5bb3243e --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/27_permissions_error_no_helm_tools/custom_resources.yaml @@ -0,0 +1,32 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: myresources.example.com +spec: + group: example.com + names: + kind: MyResource + listKind: MyResourceList + plural: myresources + singular: myresource + scope: Namespaced + versions: + - name: v1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object + properties: + spec: + type: object + properties: + field1: + type: string +--- +apiVersion: example.com/v1 +kind: MyResource +metadata: + name: my-resource-instance +spec: + field1: "value1" diff --git a/tests/llm/fixtures/test_ask_holmes/27_permissions_error_no_helm_tools/kubectl_find_resource.txt b/tests/llm/fixtures/test_ask_holmes/27_permissions_error_no_helm_tools/kubectl_find_resource.txt new file mode 100644 index 00000000..e69e4924 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/27_permissions_error_no_helm_tools/kubectl_find_resource.txt @@ -0,0 +1,6 @@ 
+{"toolset_name":"kubernetes/core","tool_name":"kubectl_find_resource","match_params":{"kind":"MyResource","keyword":"my-resource-instance"}} +Command `kubectl get -A --show-labels -o wide MyResource | grep my-resource-instance` failed with return code 1 +stdout: + +stderr: +Error from server (Forbidden): myresources.example.com is forbidden: User "system:serviceaccount:default:robusta-holmes-service-account" cannot list resource "myresources" in API group "example.com" at the cluster scope diff --git a/tests/llm/fixtures/test_ask_holmes/27_permissions_error_no_helm_tools/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/27_permissions_error_no_helm_tools/test_case.yaml new file mode 100644 index 00000000..3a4e29cb --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/27_permissions_error_no_helm_tools/test_case.yaml @@ -0,0 +1,8 @@ +user_prompt: "is there any kind of these resources on cluster kind: MyResource metadata: name: my-resource-instance" +expected_output: "To resolve this, you need to update the permissions for the robusta-holmes-service-account." 
+before_test: kubectl apply -f./custom_resources.yaml +after_test: kubectl delete -f./custom_resources.yaml +evaluation: + answer_relevancy: .5 + faithfulness: .5 + contextual_precision: 0 From 732d222001b1e7926237a99e9a083379ff24a2b7 Mon Sep 17 00:00:00 2001 From: Dima Chievtaiev Date: Mon, 6 Jan 2025 21:55:23 +0100 Subject: [PATCH 5/5] Added case for permission error when helm tools are available; adjusted prompt --- .../prompts/_general_instructions.jinja2 | 8 ++--- .../test_case.yaml | 4 ++- .../custom_resources.yaml | 32 +++++++++++++++++++ .../helm_list.txt | 6 ++++ .../helm_values.txt | 6 ++++ .../kubectl_find_resource.txt | 6 ++++ .../test_case.yaml | 10 ++++++ 7 files changed, 67 insertions(+), 5 deletions(-) create mode 100644 tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/custom_resources.yaml create mode 100644 tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/helm_list.txt create mode 100644 tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/helm_values.txt create mode 100644 tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/kubectl_find_resource.txt create mode 100644 tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/test_case.yaml diff --git a/holmes/plugins/prompts/_general_instructions.jinja2 b/holmes/plugins/prompts/_general_instructions.jinja2 index 26859b13..9f1dabb5 100644 --- a/holmes/plugins/prompts/_general_instructions.jinja2 +++ b/holmes/plugins/prompts/_general_instructions.jinja2 @@ -34,17 +34,17 @@ If during the investigation you encounter a permissions error (e.g., `Error from 1.**Analyze the Error Message** - Identify the missing resource, API group, and verbs from the error details. - Never stop at reporting the error - — Proceed with an in-depth investigation. + - Proceed with an in-depth investigation. 
2.**Locate the Relevant Helm Release** -Use Helm commands to find the release associated with the Holmes pod: +Check if Helm tools are available. If they are, always use Helm commands to help the user find the release associated with the Holmes pod: - Run `helm list -A | grep holmes` to identify the release name. - Run `helm get values -n ` to retrieve details such as `customClusterRoleRules` and `clusterName`. If Helm tools are unavailable, skip this step. 3. **Check for Missing Permissions** - - Review the `holmes-cluster-role` cluster role and `holmes-service-account` service account for missing permissions. + - Check for a cluster role with -holmes-cluster-role in its name and a service account with -holmes-service-account in its name to troubleshoot missing permissions, where the release name is the one you found earlier (if Helm tools are available). If the exact cluster role or service account isn't found, search for similar or related names, including variations or prefixes/suffixes that might be used in the cluster. - Focus on identifying absent permissions that align with the error message. 4. 
**Update the Configuration** -If necessary permissions are absent both in customClusterRoleRules and `holmes-cluster-role`, ALWAYS advise the user to update their configuration by modifying the `generated_values.yaml` file as follows: +If necessary permissions are absent both in customClusterRoleRules and the cluster role mentioned previously, ALWAYS advise the user to update their configuration by modifying the `generated_values.yaml` file as follows: ``` holmes: customClusterRoleRules: diff --git a/tests/llm/fixtures/test_ask_holmes/27_permissions_error_no_helm_tools/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/27_permissions_error_no_helm_tools/test_case.yaml index 3a4e29cb..fbcbd32f 100644 --- a/tests/llm/fixtures/test_ask_holmes/27_permissions_error_no_helm_tools/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/27_permissions_error_no_helm_tools/test_case.yaml @@ -1,5 +1,7 @@ user_prompt: "is there any kind of these resources on cluster kind: MyResource metadata: name: my-resource-instance" -expected_output: "To resolve this, you need to update the permissions for the robusta-holmes-service-account." 
+expected_output: + - Modify the generated_values.yaml file to include the missing permissions + - helm upgrade robusta/robusta --values=generated_values.yaml --set clusterName= before_test: kubectl apply -f./custom_resources.yaml after_test: kubectl delete -f./custom_resources.yaml evaluation: diff --git a/tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/custom_resources.yaml b/tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/custom_resources.yaml new file mode 100644 index 00000000..5bb3243e --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/custom_resources.yaml @@ -0,0 +1,32 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: myresources.example.com +spec: + group: example.com + names: + kind: MyResource + listKind: MyResourceList + plural: myresources + singular: myresource + scope: Namespaced + versions: + - name: v1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object + properties: + spec: + type: object + properties: + field1: + type: string +--- +apiVersion: example.com/v1 +kind: MyResource +metadata: + name: my-resource-instance +spec: + field1: "value1" diff --git a/tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/helm_list.txt b/tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/helm_list.txt new file mode 100644 index 00000000..eb552f3f --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/helm_list.txt @@ -0,0 +1,6 @@ +{"toolset_name":"helm/core","tool_name":"helm_list"} +stdout: +NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION +robusta default 135 2025-01-06 15:47:11.45987258 +0100 +0100 deployed robusta-0.0.1 0.0.0 + +stderr: \ No newline at end of file diff --git a/tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/helm_values.txt 
b/tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/helm_values.txt new file mode 100644 index 00000000..7c4acf72 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/helm_values.txt @@ -0,0 +1,6 @@ +{"toolset_name":"helm/core","tool_name":"helm_values"} + +stdout: +{"customClusterRoleRules":[], "clusterName": "test-cluster"} + +stderr: diff --git a/tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/kubectl_find_resource.txt b/tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/kubectl_find_resource.txt new file mode 100644 index 00000000..e69e4924 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/kubectl_find_resource.txt @@ -0,0 +1,6 @@ +{"toolset_name":"kubernetes/core","tool_name":"kubectl_find_resource","match_params":{"kind":"MyResource","keyword":"my-resource-instance"}} +Command `kubectl get -A --show-labels -o wide MyResource | grep my-resource-instance` failed with return code 1 +stdout: + +stderr: +Error from server (Forbidden): myresources.example.com is forbidden: User "system:serviceaccount:default:robusta-holmes-service-account" cannot list resource "myresources" in API group "example.com" at the cluster scope diff --git a/tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/test_case.yaml new file mode 100644 index 00000000..2658f920 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/test_case.yaml @@ -0,0 +1,10 @@ +user_prompt: "is there any kind of these resources on cluster kind: MyResource metadata: name: my-resource-instance" +expected_output: + - To resolve this, update your configuration by adding the necessary permissions + - helm upgrade robusta robusta/robusta --values=generated_values.yaml --set clusterName=test-cluster 
+before_test: kubectl apply -f./custom_resources.yaml +after_test: kubectl delete -f./custom_resources.yaml +evaluation: + answer_relevancy: .5 + faithfulness: .5 + contextual_precision: 0