Merge branch 'master' into add_grafana_loki_tool

robusta-dev · Jan 7, 2025 · 6670410 · 6670410
2 parents 8d2ae21 + b3f5961
commit 6670410
Show file tree

Hide file tree

Showing 12 changed files with 208 additions and 8 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -43,6 +43,14 @@ RUN ./kube-lineage --version
 
 RUN curl -sSL -o argocd-linux-amd64 https://github.com/argoproj/argo-cd/releases/latest/download/argocd-linux-amd64
 
+# Install Helm
+RUN curl https://baltocdn.com/helm/signing.asc | gpg --dearmor -o /usr/share/keyrings/helm.gpg \
+    && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/helm.gpg] https://baltocdn.com/helm/stable/debian/ all main" \
+    | tee /etc/apt/sources.list.d/helm-stable-debian.list \
+    && apt-get update \
+    && apt-get install -y helm \
+    && rm -rf /var/lib/apt/lists/*
+
 # Set up poetry
 ARG PRIVATE_PACKAGE_REGISTRY="none"
 RUN if [ "${PRIVATE_PACKAGE_REGISTRY}" != "none" ]; then \
@@ -92,10 +100,16 @@ RUN apt-get install -y kubectl
 COPY --from=builder /app/kube-lineage /usr/local/bin
 RUN kube-lineage --version
 
+# Set up ArgoCD
 COPY --from=builder /app/argocd-linux-amd64 /usr/local/bin/argocd
 RUN chmod 555 /usr/local/bin/argocd
 RUN argocd --help
 
+# Set up Helm
+COPY --from=builder /usr/bin/helm /usr/local/bin/helm
+RUN chmod 555 /usr/local/bin/helm
+RUN helm version
+
 ARG AWS_DEFAULT_PROFILE
 ARG AWS_DEFAULT_REGION
 ARG AWS_PROFILE

diff --git a/holmes/core/supabase_dal.py b/holmes/core/supabase_dal.py
@@ -1,4 +1,5 @@
 import base64
+import binascii
 import json
 import logging
 import os
@@ -7,7 +8,11 @@
 from uuid import uuid4
 
 import yaml
-from holmes.core.tool_calling_llm import ResourceInstructionDocument, ResourceInstructions, Instructions
+from holmes.core.tool_calling_llm import (
+    ResourceInstructionDocument,
+    ResourceInstructions,
+    Instructions,
+)
 from holmes.utils.definitions import RobustaConfig
 from postgrest.types import ReturnMethod
 from supabase import create_client
@@ -17,8 +22,14 @@
 from postgrest._sync.request_builder import SyncQueryRequestBuilder
 from postgrest.exceptions import APIError as PGAPIError
 
-from holmes.common.env_vars import (ROBUSTA_CONFIG_PATH, ROBUSTA_ACCOUNT_ID, STORE_URL, STORE_API_KEY, STORE_EMAIL,
-                                    STORE_PASSWORD)
+from holmes.common.env_vars import (
+    ROBUSTA_CONFIG_PATH,
+    ROBUSTA_ACCOUNT_ID,
+    STORE_URL,
+    STORE_API_KEY,
+    STORE_EMAIL,
+    STORE_PASSWORD,
+)
 
 from datetime import datetime, timedelta
 
@@ -83,7 +94,17 @@ def __load_robusta_config() -> Optional[RobustaToken]:
         env_ui_token = os.environ.get("ROBUSTA_UI_TOKEN")
         if env_ui_token:
             # token provided as env var
-            return RobustaToken(**json.loads(base64.b64decode(env_ui_token)))
+            try:
+                decoded = base64.b64decode(env_ui_token)
+                return RobustaToken(**json.loads(decoded))
+            except binascii.Error:
+                raise Exception(
+                    f"binascii.Error encountered. The Robusta UI token is not a valid base64."
+                )
+            except json.JSONDecodeError:
+                raise Exception(
+                    f"json.JSONDecodeError encountered. The Robusta UI token could not be parsed as JSON after being base64 decoded."
+                )
 
         if not os.path.exists(config_file_path):
             logging.info(f"No robusta config in {config_file_path}")
@@ -96,8 +117,31 @@ def __load_robusta_config() -> Optional[RobustaToken]:
             for conf in config.sinks_config:
                 if "robusta_sink" in conf.keys():
                     token = conf["robusta_sink"].get("token")
-                    return RobustaToken(**json.loads(base64.b64decode(token)))
-
+                    if not token:
+                        raise Exception(
+                            f"No token provided in robusta_sink. "
+                            f"Please set a valid Robusta UI token. "
+                            f"See https://docs.robusta.dev/master/configuration/sinks/RobustaUI.html#configuring-the-robusta-ui-sink for instructions."
+                        )
+                    if "{{" in token:
+                        raise ValueError(
+                            f"The token appears to be a templating placeholder (e.g. `{{ env.UI_SINK_TOKEN }}`). "
+                            f"Ensure your Helm chart or environment variables are set correctly. "
+                            f"If you store the token in a secret, you must also pass "
+                            f"the environment variable ROBUSTA_UI_TOKEN to Holmes. "
+                            f"See https://docs.robusta.dev/master/configuration/ai-analysis.html#configuring-holmesgpt-access-to-saas-data for instructions."
+                        )
+                    try:
+                        decoded = base64.b64decode(token)
+                        return RobustaToken(**json.loads(decoded))
+                    except binascii.Error:
+                        raise Exception(
+                            f"binascii.Error encountered. The Robusta UI token is not a valid base64."
+                        )
+                    except json.JSONDecodeError:
+                        raise Exception(
+                            f"json.JSONDecodeError encountered. The Robusta UI token could not be parsed as JSON after being base64 decoded."
+                        )
         return None
 
     def __init_config(self) -> bool:

diff --git a/holmes/plugins/prompts/_general_instructions.jinja2 b/holmes/plugins/prompts/_general_instructions.jinja2
@@ -29,10 +29,44 @@ If investigating Kubernetes problems:
 ** check the application aspects through the logs (kubectl_logs and kubectl_previous_logs) and other relevant tools
 ** look for misconfigured ingresses/services etc
 
+Handling Permission Errors
+If during the investigation you encounter a permissions error (e.g., `Error from server (Forbidden):`), **ALWAYS** follow these steps to ensure a thorough resolution:
+1. **Analyze the Error Message**
+ - Identify the missing resource, API group, and verbs from the error details.
+ - Never stop at reporting the error
+ - Proceed with an in-depth investigation.
+2. **Locate the Relevant Helm Release**
+Check whether Helm tools are available. If they are, always use Helm commands to find the release associated with the Holmes pod:
+ - Run `helm list -A | grep holmes` to identify the release name.
+ - Run `helm get values <RELEASE_NAME> -n <NAMESPACE>` to retrieve details such as `customClusterRoleRules` and `clusterName`.
+If Helm tools are unavailable, skip this step.
+3. **Check for Missing Permissions**
+ - To troubleshoot missing permissions, look for a cluster role with <RELEASE_NAME>-holmes-cluster-role in its name and a service account with <RELEASE_NAME>-holmes-service-account in its name, where <RELEASE_NAME> is the release name you found earlier (if Helm tools are available). If the exact cluster role or service account isn't found, search for similar or related names, including variations or prefixes/suffixes that might be used in the cluster.
+ - Focus on identifying absent permissions that align with the error message.
+4. **Update the Configuration**
+If the necessary permissions are absent from both `customClusterRoleRules` and the cluster role mentioned previously, ALWAYS advise the user to update their configuration by modifying the `generated_values.yaml` file as follows:
+```
+holmes:
+    customClusterRoleRules:
+      - apiGroups: ["<API_GROUP>"]
+        resources: ["<RESOURCE_1>", "<RESOURCE_2>"]
+        verbs: ["<VERB_1>", "<VERB_2>", "<VERB_3>"]
+```
+After that, instruct them to apply the changes with:
+```
+    helm upgrade <RELEASE_NAME> robusta/robusta --values=generated_values.yaml --set clusterName=<YOUR_CLUSTER_NAME>
+```
+5. **Fallback Guidelines**
+- If you cannot determine the release or cluster name, use placeholders `<RELEASE_NAME>` and `<YOUR_CLUSTER_NAME>`.
+- While you should attempt to retrieve details using Helm commands, do **not** direct the user to execute these commands themselves.
+Reminder:
+* Always adhere to this process, even if Helm tools are unavailable.
+* Strive for thoroughness and precision, ensuring the issue is fully addressed.
+
 Special cases and how to reply:
 * if you are unable to investigate something properly because you do not have tools that let you access the right data, explicitly tell the user that you are missing an integration to access XYZ which you would need to investigate. you should give an answer similar to "I don't have access to <details>. Please add a Holmes integration for <XYZ> so that I can investigate this."
 * make sure you differentiate between "I investigated and found error X caused this problem" and "I tried to investigate but while investigating I got some errors that prevented me from completing the investigation."
-* as a special case of that, if you try to investigate by running a tool and the tool gives you output that permissions are missing *to run the tool* then say "I tried to investigate but I am missing permissions to run the tool <tool_name>. <details and exact logs of the error message>"
+* as a special case of that, if a tool returns a permission error when you attempt to run it, follow the Handling Permission Errors section for detailed guidance.
 * that is different from - for example - fetching a pod's logs and seeing that the pod itself has permission errors. in that case, you explain that permission errors are the cause of the problem and give details
 * Issues are a subset of findings. When asked about an issue or a finding and you have an id, use the tool `fetch_finding_by_id`.
 * For any question, try to make the answer specific to the user's cluster.

diff --git a/holmes/plugins/prompts/generic_ask_conversation.jinja2 b/holmes/plugins/prompts/generic_ask_conversation.jinja2
@@ -6,10 +6,10 @@ If you have a good and concrete suggestion for how the user can fix something, t
 
 Use conversation history to maintain continuity when appropriate, ensuring efficiency in your responses.
 
-{% include '_global_instructions.jinja2' %}
 
 {% include '_general_instructions.jinja2' %}
 
+
 Style guide:
 * Reply with terse output.
 * Be painfully concise.

diff --git a/tests/llm/fixtures/test_ask_holmes/27_permissions_error_no_helm_tools/custom_resources.yaml b/tests/llm/fixtures/test_ask_holmes/27_permissions_error_no_helm_tools/custom_resources.yaml
@@ -0,0 +1,32 @@
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  name: myresources.example.com
+spec:
+  group: example.com
+  names:
+    kind: MyResource
+    listKind: MyResourceList
+    plural: myresources
+    singular: myresource
+  scope: Namespaced
+  versions:
+    - name: v1
+      served: true
+      storage: true
+      schema:
+        openAPIV3Schema:
+          type: object
+          properties:
+            spec:
+              type: object
+              properties:
+                field1:
+                  type: string
+---
+apiVersion: example.com/v1
+kind: MyResource
+metadata:
+  name: my-resource-instance
+spec:
+  field1: "value1"
diff --git a/...llm/fixtures/test_ask_holmes/27_permissions_error_no_helm_tools/kubectl_find_resource.txt b/...llm/fixtures/test_ask_holmes/27_permissions_error_no_helm_tools/kubectl_find_resource.txt
@@ -0,0 +1,6 @@
+{"toolset_name":"kubernetes/core","tool_name":"kubectl_find_resource","match_params":{"kind":"MyResource","keyword":"my-resource-instance"}}
+Command `kubectl get -A --show-labels -o wide MyResource | grep my-resource-instance` failed with return code 1
+stdout:
+
+stderr:
+Error from server (Forbidden): myresources.example.com is forbidden: User "system:serviceaccount:default:robusta-holmes-service-account" cannot list resource "myresources" in API group "example.com" at the cluster scope
diff --git a/tests/llm/fixtures/test_ask_holmes/27_permissions_error_no_helm_tools/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/27_permissions_error_no_helm_tools/test_case.yaml
@@ -0,0 +1,10 @@
+user_prompt: "is there any kind of these resources on cluster kind: MyResource metadata: name: my-resource-instance"
+expected_output: 
+  - Modify the generated_values.yaml file to include the missing permissions 
+  - helm upgrade <RELEASE_NAME> robusta/robusta --values=generated_values.yaml --set clusterName=<YOUR_CLUSTER_NAME>
+before_test: kubectl apply -f./custom_resources.yaml
+after_test: kubectl delete -f./custom_resources.yaml
+evaluation:
+  answer_relevancy: .5
+  faithfulness: .5
+  contextual_precision: 0
diff --git a/...lm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/custom_resources.yaml b/...lm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/custom_resources.yaml
@@ -0,0 +1,32 @@
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  name: myresources.example.com
+spec:
+  group: example.com
+  names:
+    kind: MyResource
+    listKind: MyResourceList
+    plural: myresources
+    singular: myresource
+  scope: Namespaced
+  versions:
+    - name: v1
+      served: true
+      storage: true
+      schema:
+        openAPIV3Schema:
+          type: object
+          properties:
+            spec:
+              type: object
+              properties:
+                field1:
+                  type: string
+---
+apiVersion: example.com/v1
+kind: MyResource
+metadata:
+  name: my-resource-instance
+spec:
+  field1: "value1"
diff --git a/tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/helm_list.txt b/tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/helm_list.txt
@@ -0,0 +1,6 @@
+{"toolset_name":"helm/core","tool_name":"helm_list"}
+stdout:
+NAME   	NAMESPACE	REVISION	UPDATED                                 	STATUS  	CHART        	APP VERSION
+robusta	default  	135     	2025-01-06 15:47:11.45987258 +0100 +0100	deployed	robusta-0.0.1	0.0.0      
+
+stderr:
diff --git a/tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/helm_values.txt b/tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/helm_values.txt
@@ -0,0 +1,6 @@
+{"toolset_name":"helm/core","tool_name":"helm_list"}
+
+stdout:
+{"customClusterRoleRules":[], "clusterName": "test-cluster"}
+
+stderr:
diff --git a/...ixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/kubectl_find_resource.txt b/...ixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/kubectl_find_resource.txt
@@ -0,0 +1,6 @@
+{"toolset_name":"kubernetes/core","tool_name":"kubectl_find_resource","match_params":{"kind":"MyResource","keyword":"my-resource-instance"}}
+Command `kubectl get -A --show-labels -o wide MyResource | grep my-resource-instance` failed with return code 1
+stdout:
+
+stderr:
+Error from server (Forbidden): myresources.example.com is forbidden: User "system:serviceaccount:default:robusta-holmes-service-account" cannot list resource "myresources" in API group "example.com" at the cluster scope
diff --git a/tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/28_permissions_error_helm_tools_enabled/test_case.yaml
@@ -0,0 +1,10 @@
+user_prompt: "is there any kind of these resources on cluster kind: MyResource metadata: name: my-resource-instance"
+expected_output: 
+  - To resolve this, update your configuration by adding the necessary permissions
+  - helm upgrade robusta robusta/robusta --values=generated_values.yaml --set clusterName=test-cluster
+before_test: kubectl apply -f./custom_resources.yaml
+after_test: kubectl delete -f./custom_resources.yaml
+evaluation:
+  answer_relevancy: .5
+  faithfulness: .5
+  contextual_precision: 0