Skip to content

Commit

Permalink
Merge branch 'master' into add_grafana_loki_tool
Browse files Browse the repository at this point in the history
  • Loading branch information
nherment authored Jan 7, 2025
2 parents 8d2ae21 + b3f5961 commit 6670410
Show file tree
Hide file tree
Showing 12 changed files with 208 additions and 8 deletions.
14 changes: 14 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,14 @@ RUN ./kube-lineage --version

RUN curl -sSL -o argocd-linux-amd64 https://github.com/argoproj/argo-cd/releases/latest/download/argocd-linux-amd64

# Install Helm
RUN curl https://baltocdn.com/helm/signing.asc | gpg --dearmor -o /usr/share/keyrings/helm.gpg \
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/helm.gpg] https://baltocdn.com/helm/stable/debian/ all main" \
| tee /etc/apt/sources.list.d/helm-stable-debian.list \
&& apt-get update \
&& apt-get install -y helm \
&& rm -rf /var/lib/apt/lists/*

# Set up poetry
ARG PRIVATE_PACKAGE_REGISTRY="none"
RUN if [ "${PRIVATE_PACKAGE_REGISTRY}" != "none" ]; then \
Expand Down Expand Up @@ -92,10 +100,16 @@ RUN apt-get install -y kubectl
COPY --from=builder /app/kube-lineage /usr/local/bin
RUN kube-lineage --version

# Set up ArgoCD
COPY --from=builder /app/argocd-linux-amd64 /usr/local/bin/argocd
RUN chmod 555 /usr/local/bin/argocd
RUN argocd --help

# Set up Helm
COPY --from=builder /usr/bin/helm /usr/local/bin/helm
RUN chmod 555 /usr/local/bin/helm
RUN helm version

ARG AWS_DEFAULT_PROFILE
ARG AWS_DEFAULT_REGION
ARG AWS_PROFILE
Expand Down
56 changes: 50 additions & 6 deletions holmes/core/supabase_dal.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import base64
import binascii
import json
import logging
import os
Expand All @@ -7,7 +8,11 @@
from uuid import uuid4

import yaml
from holmes.core.tool_calling_llm import ResourceInstructionDocument, ResourceInstructions, Instructions
from holmes.core.tool_calling_llm import (
ResourceInstructionDocument,
ResourceInstructions,
Instructions,
)
from holmes.utils.definitions import RobustaConfig
from postgrest.types import ReturnMethod
from supabase import create_client
Expand All @@ -17,8 +22,14 @@
from postgrest._sync.request_builder import SyncQueryRequestBuilder
from postgrest.exceptions import APIError as PGAPIError

from holmes.common.env_vars import (ROBUSTA_CONFIG_PATH, ROBUSTA_ACCOUNT_ID, STORE_URL, STORE_API_KEY, STORE_EMAIL,
STORE_PASSWORD)
from holmes.common.env_vars import (
ROBUSTA_CONFIG_PATH,
ROBUSTA_ACCOUNT_ID,
STORE_URL,
STORE_API_KEY,
STORE_EMAIL,
STORE_PASSWORD,
)

from datetime import datetime, timedelta

Expand Down Expand Up @@ -83,7 +94,17 @@ def __load_robusta_config() -> Optional[RobustaToken]:
env_ui_token = os.environ.get("ROBUSTA_UI_TOKEN")
if env_ui_token:
# token provided as env var
return RobustaToken(**json.loads(base64.b64decode(env_ui_token)))
try:
decoded = base64.b64decode(env_ui_token)
return RobustaToken(**json.loads(decoded))
except binascii.Error:
raise Exception(
f"binascii.Error encountered. The Robusta UI token is not a valid base64."
)
except json.JSONDecodeError:
raise Exception(
f"json.JSONDecodeError encountered. The Robusta UI token could not be parsed as JSON after being base64 decoded."
)

if not os.path.exists(config_file_path):
logging.info(f"No robusta config in {config_file_path}")
Expand All @@ -96,8 +117,31 @@ def __load_robusta_config() -> Optional[RobustaToken]:
for conf in config.sinks_config:
if "robusta_sink" in conf.keys():
token = conf["robusta_sink"].get("token")
return RobustaToken(**json.loads(base64.b64decode(token)))

if not token:
raise Exception(
f"No token provided in robusta_sink. "
f"Please set a valid Robusta UI token. "
f"See https://docs.robusta.dev/master/configuration/sinks/RobustaUI.html#configuring-the-robusta-ui-sink for instructions."
)
if "{{" in token:
raise ValueError(
f"The token appears to be a templating placeholder (e.g. `{{ env.UI_SINK_TOKEN }}`). "
f"Ensure your Helm chart or environment variables are set correctly. "
f"If you store the token in a secret, you must also pass "
f"the environment variable ROBUSTA_UI_TOKEN to Holmes. "
f"See https://docs.robusta.dev/master/configuration/ai-analysis.html#configuring-holmesgpt-access-to-saas-data for instructions."
)
try:
decoded = base64.b64decode(token)
return RobustaToken(**json.loads(decoded))
except binascii.Error:
raise Exception(
f"binascii.Error encountered. The Robusta UI token is not a valid base64."
)
except json.JSONDecodeError:
raise Exception(
f"json.JSONDecodeError encountered. The Robusta UI token could not be parsed as JSON after being base64 decoded."
)
return None

def __init_config(self) -> bool:
Expand Down
36 changes: 35 additions & 1 deletion holmes/plugins/prompts/_general_instructions.jinja2
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,44 @@ If investigating Kubernetes problems:
** check the application aspects through the logs (kubectl_logs and kubectl_previous_logs) and other relevant tools
** look for misconfigured ingresses/services etc

Handling Permission Errors
If during the investigation you encounter a permissions error (e.g., `Error from server (Forbidden):`), **ALWAYS** follow these steps to ensure a thorough resolution:
1. **Analyze the Error Message**
- Identify the missing resource, API group, and verbs from the error details.
- Never stop at reporting the error
- Proceed with an in-depth investigation.
2. **Locate the Relevant Helm Release**
Check whether Helm tools are available. If they are, always use Helm commands to help the user find the release associated with the Holmes pod:
- Run `helm list -A | grep holmes` to identify the release name.
- Run `helm get values <RELEASE_NAME> -n <NAMESPACE>` to retrieve details such as `customClusterRoleRules` and `clusterName`.
If Helm tools are unavailable, skip this step.
3. **Check for Missing Permissions**
- Check for a cluster role with `<RELEASE_NAME>-holmes-cluster-role` in its name and a service account with `<RELEASE_NAME>-holmes-service-account` in its name to troubleshoot missing permissions, where `<RELEASE_NAME>` is the release name you found earlier (if Helm tools are available). If the exact cluster role or service account isn't found, search for similar or related names, including variations or prefixes/suffixes that might be used in the cluster.
- Focus on identifying absent permissions that align with the error message.
4. **Update the Configuration**
If the necessary permissions are absent from both `customClusterRoleRules` and the cluster role mentioned previously, ALWAYS advise the user to update their configuration by modifying the `generated_values.yaml` file as follows:
```
holmes:
customClusterRoleRules:
- apiGroups: ["<API_GROUP>"]
resources: ["<RESOURCE_1>", "<RESOURCE_2>"]
verbs: ["<VERB_1>", "<VERB_2>", "<VERB_3>"]
```
After that, instruct them to apply the changes with:
```
helm upgrade <RELEASE_NAME> robusta/robusta --values=generated_values.yaml --set clusterName=<YOUR_CLUSTER_NAME>
```
5. **Fallback Guidelines**
- If you cannot determine the release or cluster name, use placeholders `<RELEASE_NAME>` and `<YOUR_CLUSTER_NAME>`.
- While you should attempt to retrieve details using Helm commands, do **not** direct the user to execute these commands themselves.
Reminder:
* Always adhere to this process, even if Helm tools are unavailable.
* Strive for thoroughness and precision, ensuring the issue is fully addressed.

Special cases and how to reply:
* if you are unable to investigate something properly because you do not have tools that let you access the right data, explicitly tell the user that you are missing an integration to access XYZ which you would need to investigate. you should give an answer similar to "I don't have access to <details>. Please add a Holmes integration for <XYZ> so that I can investigate this."
* make sure you differentiate between "I investigated and found error X caused this problem" and "I tried to investigate but while investigating I got some errors that prevented me from completing the investigation."
* as a special case of that, if you try to investigate by running a tool and the tool gives you output that permissions are missing *to run the tool* then say "I tried to investigate but I am missing permissions to run the tool <tool_name>. <details and exact logs of the error message>"
* as a special case of that, if a tool generates a permission error when attempting to run it, follow the Handling Permission Errors section for detailed guidance.
* that is different than - for example - fetching a pod's logs and seeing that the pod itself has permission errors. in that case, explain that permission errors are the cause of the problem and give details
* Issues are a subset of findings. When asked about an issue or a finding and you have an id, use the tool `fetch_finding_by_id`.
* For any question, try to make the answer specific to the user's cluster.
Expand Down
2 changes: 1 addition & 1 deletion holmes/plugins/prompts/generic_ask_conversation.jinja2
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@ If you have a good and concrete suggestion for how the user can fix something, t

Use conversation history to maintain continuity when appropriate, ensuring efficiency in your responses.

{% include '_global_instructions.jinja2' %}

{% include '_general_instructions.jinja2' %}


Style guide:
* Reply with terse output.
* Be painfully concise.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
name: myresources.example.com
spec:
group: example.com
names:
kind: MyResource
listKind: MyResourceList
plural: myresources
singular: myresource
scope: Namespaced
versions:
- name: v1
served: true
storage: true
schema:
openAPIV3Schema:
type: object
properties:
spec:
type: object
properties:
field1:
type: string
---
apiVersion: example.com/v1
kind: MyResource
metadata:
name: my-resource-instance
spec:
field1: "value1"
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{"toolset_name":"kubernetes/core","tool_name":"kubectl_find_resource","match_params":{"kind":"MyResource","keyword":"my-resource-instance"}}
Command `kubectl get -A --show-labels -o wide MyResource | grep my-resource-instance` failed with return code 1
stdout:

stderr:
Error from server (Forbidden): myresources.example.com is forbidden: User "system:serviceaccount:default:robusta-holmes-service-account" cannot list resource "myresources" in API group "example.com" at the cluster scope
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
user_prompt: "is there any kind of these resources on cluster kind: MyResource metadata: name: my-resource-instance"
expected_output:
- Modify the generated_values.yaml file to include the missing permissions
- helm upgrade <RELEASE_NAME> robusta/robusta --values=generated_values.yaml --set clusterName=<YOUR_CLUSTER_NAME>
before_test: kubectl apply -f./custom_resources.yaml
after_test: kubectl delete -f./custom_resources.yaml
evaluation:
answer_relevancy: .5
faithfulness: .5
contextual_precision: 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
name: myresources.example.com
spec:
group: example.com
names:
kind: MyResource
listKind: MyResourceList
plural: myresources
singular: myresource
scope: Namespaced
versions:
- name: v1
served: true
storage: true
schema:
openAPIV3Schema:
type: object
properties:
spec:
type: object
properties:
field1:
type: string
---
apiVersion: example.com/v1
kind: MyResource
metadata:
name: my-resource-instance
spec:
field1: "value1"
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{"toolset_name":"helm/core","tool_name":"helm_list"}
stdout:
NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION
robusta default 135 2025-01-06 15:47:11.45987258 +0100 +0100 deployed robusta-0.0.1 0.0.0

stderr:
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{"toolset_name":"helm/core","tool_name":"helm_list"}

stdout:
{"customClusterRoleRules":[], "clusterName": "test-cluster"}

stderr:
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{"toolset_name":"kubernetes/core","tool_name":"kubectl_find_resource","match_params":{"kind":"MyResource","keyword":"my-resource-instance"}}
Command `kubectl get -A --show-labels -o wide MyResource | grep my-resource-instance` failed with return code 1
stdout:

stderr:
Error from server (Forbidden): myresources.example.com is forbidden: User "system:serviceaccount:default:robusta-holmes-service-account" cannot list resource "myresources" in API group "example.com" at the cluster scope
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
user_prompt: "is there any kind of these resources on cluster kind: MyResource metadata: name: my-resource-instance"
expected_output:
- To resolve this, update your configuration by adding the necessary permissions
- helm upgrade robusta robusta/robusta --values=generated_values.yaml --set clusterName=test-cluster
before_test: kubectl apply -f./custom_resources.yaml
after_test: kubectl delete -f./custom_resources.yaml
evaluation:
answer_relevancy: .5
faithfulness: .5
contextual_precision: 0

0 comments on commit 6670410

Please sign in to comment.