diff --git a/_topic_maps/_topic_map.yml b/_topic_maps/_topic_map.yml index 35a7fc9eebf6..65a710d2114a 100644 --- a/_topic_maps/_topic_map.yml +++ b/_topic_maps/_topic_map.yml @@ -52,7 +52,6 @@ Topics: # File: logging-affinity-and-anti-afinity #--- -#--- #Name: Configuring your Logging deployment #Dir: config #Distros: openshift-logging @@ -103,14 +102,15 @@ Topics: # File: cluster-logging-dashboards #- Name: Log visualization with Kibana # File: logging-kibana -#--- -#Name: Logging alerts -#Dir: logging_alerts -#Topics: -#- Name: Default logging alerts -# File: default-logging-alerts -#- Name: Custom logging alerts -# File: custom-logging-alerts +--- +Name: Logging alerts +Dir: logging_alerts +Distros: openshift-logging +Topics: +- Name: Default logging alerts + File: default-logging-alerts +- Name: Custom logging alerts + File: custom-logging-alerts #--- #Name: Performance and reliability tuning #Dir: performance_reliability diff --git a/logging_alerts/custom-logging-alerts.adoc b/logging_alerts/custom-logging-alerts.adoc index d69dc0d11ec4..d3069a92e9ce 100644 --- a/logging_alerts/custom-logging-alerts.adoc +++ b/logging_alerts/custom-logging-alerts.adoc @@ -6,11 +6,11 @@ include::_attributes/common-attributes.adoc[] toc::[] -In logging 5.7 and later versions, users can configure the LokiStack deployment to produce customized alerts and recorded metrics. If you want to use customized link:https://grafana.com/docs/loki/latest/alert/[alerting and recording rules], you must enable the LokiStack ruler component. +You can configure the LokiStack deployment to produce customized alerts and recorded metrics. If you want to use customized link:https://grafana.com/docs/loki/latest/alert/[alerting and recording rules], you must enable the LokiStack ruler component. -LokiStack log-based alerts and recorded metrics are triggered by providing link:https://grafana.com/docs/loki/latest/query/[LogQL] expressions to the ruler component. 
The {loki-op} manages a ruler that is optimized for the selected LokiStack size, which can be `1x.extra-small`, `1x.small`, or `1x.medium`. +LokiStack log-based alerts and recorded metrics are triggered by providing link:https://grafana.com/docs/loki/latest/query/[LogQL] (Grafana documentation) expressions to the ruler component. -To provide these expressions, you must create an `AlertingRule` custom resource (CR) containing Prometheus-compatible link:https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/[alerting rules], or a `RecordingRule` CR containing Prometheus-compatible link:https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/[recording rules]. +To provide these expressions, you must create an `AlertingRule` custom resource (CR) containing Prometheus-compatible link:https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/[alerting rules], or a `RecordingRule` CR containing Prometheus-compatible link:https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/[recording rules] (Prometheus documentation). Administrators can configure log-based alerts or recorded metrics for `application`, `audit`, or `infrastructure` tenants. Users without administrator permissions can configure log-based alerts or recorded metrics for `application` tenants of the applications that they have access to. diff --git a/logging_alerts/default-logging-alerts.adoc b/logging_alerts/default-logging-alerts.adoc index eab6a7c8f6a2..1a517798b2a6 100644 --- a/logging_alerts/default-logging-alerts.adoc +++ b/logging_alerts/default-logging-alerts.adoc @@ -11,11 +11,15 @@ Logging alerts are installed as part of the {clo} installation. Alerts depend on Default logging alerts are sent to the {ocp-product-title} monitoring stack Alertmanager in the `openshift-monitoring` namespace, unless you have disabled the local Alertmanager instance. 
// TODO MONITORING REMOVE DEPENDENCY -include::modules/monitoring-accessing-the-alerting-ui.adoc[leveloffset=+1] -include::modules/logging-collector-alerts.adoc[leveloffset=+1] +include::modules/monitoring-accessing-the-alerting-ui.adoc[leveloffset=+1,tag=ADM] +//include::modules/logging-collector-alerts.adoc[leveloffset=+1] include::modules/logging-vector-collector-alerts.adoc[leveloffset=+1] +include::modules/loki-alerts.adoc[leveloffset=+1] + +//// include::modules/logging-fluentd-collector-alerts.adoc[leveloffset=+1] include::modules/cluster-logging-elasticsearch-rules.adoc[leveloffset=+1] +//// [role="_additional-resources"] [id="additional-resources_default-logging-alerts"] diff --git a/logging_alerts/docinfo.xml b/logging_alerts/docinfo.xml new file mode 100644 index 000000000000..8ac823c15473 --- /dev/null +++ b/logging_alerts/docinfo.xml @@ -0,0 +1,12 @@ +Logging alerts +{product-title} +{product-version} +Configuring logging alerts. + + This document provides information about configuring logging alerts. + + + + Red Hat OpenShift Documentation Team + + \ No newline at end of file diff --git a/modules/configuring-logging-loki-ruler.adoc b/modules/configuring-logging-loki-ruler.adoc index 00bfae081851..acdd5296599b 100644 --- a/modules/configuring-logging-loki-ruler.adoc +++ b/modules/configuring-logging-loki-ruler.adoc @@ -1,12 +1,12 @@ // Module included in the following assemblies: // -// * observability/logging/logging_alerts/custom-logging-alerts.adoc +// * logging_alerts/custom-logging-alerts.adoc :_mod-docs-content-type: PROCEDURE [id="configuring-logging-loki-ruler_{context}"] = Configuring the ruler -When the LokiStack ruler component is enabled, users can define a group of link:https://grafana.com/docs/loki/latest/query/[LogQL] expressions that trigger logging alerts or recorded metrics. 
+When the `LokiStack` ruler component is enabled, users can define a group of link:https://grafana.com/docs/loki/latest/query/[LogQL] (Grafana documentation) expressions that trigger logging alerts or recorded metrics. Administrators can enable the ruler by modifying the `LokiStack` custom resource (CR). @@ -18,7 +18,7 @@ Administrators can enable the ruler by modifying the `LokiStack` custom resource .Procedure -* Enable the ruler by ensuring that the `LokiStack` CR contains the following spec configuration: +* Enable the ruler by ensuring that the `LokiStack` CR has the following spec configuration: + [source,yaml] ---- @@ -30,14 +30,16 @@ metadata: spec: # ... rules: - enabled: true <1> - selector: + enabled: true #<1> + selector: #<2> matchLabels: - openshift.io/: "true" <2> - namespaceSelector: + : "true" #<3> + namespaceSelector: #<4> matchLabels: - openshift.io/: "true" <3> + : "true" #<5> ---- <1> Enable Loki alerting and recording rules in your cluster. -<2> Add a custom label that can be added to namespaces where you want to enable the use of logging alerts and metrics. +<2> Specify the selector for the alerting and recording resources. -<3> Add a custom label that can be added to namespaces where you want to enable the use of logging alerts and metrics. +<3> Add a custom label that can be added to the `AlertingRule` and `RecordingRule` CRs that you want the ruler to use. +<4> Specify the namespaces in which the alerting and recording rules are defined for the {loki-op}. If undefined, only the rules defined in the same namespace as the `LokiStack` are used. +<5> Add a custom label that can be added to namespaces where you want to enable the use of logging alerts and metrics. 
diff --git a/modules/logging-collector-alerts.adoc b/modules/logging-collector-alerts.adoc index d9956505d08b..43b7fcc6e9d7 100644 --- a/modules/logging-collector-alerts.adoc +++ b/modules/logging-collector-alerts.adoc @@ -1,12 +1,12 @@ // Module included in the following assemblies: // -// * logging/logging_alerts/default-logging-alerts.adoc +// * logging_alerts/default-logging-alerts.adoc :_content-type: REFERENCE [id="logging-collector-alerts_{context}"] = Logging collector alerts -In logging 5.8 and later versions, the following alerts are generated by the {clo}. You can view these alerts in the {ocp-product-title} web console. +The following alerts are generated by the {clo}. You can view these alerts in the {ocp-product-title} web console. [cols="4", options="header"] |=== diff --git a/modules/logging-enabling-loki-alerts.adoc b/modules/logging-enabling-loki-alerts.adoc index d5536390f8ae..5cbfb0f393d3 100644 --- a/modules/logging-enabling-loki-alerts.adoc +++ b/modules/logging-enabling-loki-alerts.adoc @@ -1,6 +1,6 @@ // Module included in the following assemblies: // -// * observability/logging/logging_alerts/custom-logging-alerts.adoc +// * logging_alerts/custom-logging-alerts.adoc :_mod-docs-content-type: PROCEDURE [id="logging-enabling-loki-alerts_{context}"] @@ -12,14 +12,14 @@ The `AlertingRule` CR contains a set of specifications and webhook validation de * If an `AlertingRule` CR includes an invalid `for` period, it is an invalid alerting rule. * If an `AlertingRule` CR includes an invalid LogQL `expr`, it is an invalid alerting rule. * If an `AlertingRule` CR includes two groups with the same name, it is an invalid alerting rule. -* If none of above applies, an alerting rule is considered valid. +* If none of the above applies, an alerting rule is considered valid. 
[options="header"] |================================================ | Tenant type | Valid namespaces for `AlertingRule` CRs -| application | | audit | `openshift-logging` -| infrastructure | `openshift-/\*`, `kube-/\*`, `default` +| infrastructure | `openshift-\*`, `kube-*`, `default` +| application | All other namespaces. |================================================ .Prerequisites @@ -38,30 +38,30 @@ The `AlertingRule` CR contains a set of specifications and webhook validation de kind: AlertingRule metadata: name: loki-operator-alerts - namespace: openshift-operators-redhat <1> - labels: <2> - openshift.io/: "true" + namespace: openshift-operators-redhat #<1> + labels: #<2> + openshift.io/cluster-monitoring: "true" spec: - tenantID: "infrastructure" <3> + tenantID: infrastructure #<3> groups: - name: LokiOperatorHighReconciliationError rules: - alert: HighPercentageError - expr: | <4> + expr: | #<4> sum(rate({kubernetes_namespace_name="openshift-operators-redhat", kubernetes_pod_name=~"loki-operator-controller-manager.*"} |= "error" [1m])) by (job) / sum(rate({kubernetes_namespace_name="openshift-operators-redhat", kubernetes_pod_name=~"loki-operator-controller-manager.*"}[1m])) by (job) > 0.01 for: 10s labels: - severity: critical <5> + severity: critical #<5> annotations: - summary: High Loki Operator Reconciliation Errors <6> - description: High Loki Operator Reconciliation Errors <7> + summary: High Loki Operator Reconciliation Errors #<6> + description: High Loki Operator Reconciliation Errors #<7> ---- <1> The namespace where this `AlertingRule` CR is created must have a label matching the LokiStack `spec.rules.namespaceSelector` definition. <2> The `labels` block must match the LokiStack `spec.rules.selector` definition. -<3> `AlertingRule` CRs for `infrastructure` tenants are only supported in the `openshift-\*`, `kube-\*`, or `default` namespaces. 
+<3> `AlertingRule` CRs for `infrastructure` tenants are only supported in the `openshift-\*`, `kube-*`, or `default` namespaces. <4> The value for `kubernetes_namespace_name:` must match the value for `metadata.namespace`. <5> The value of this mandatory field must be `critical`, `warning`, or `info`. <6> This field is mandatory. @@ -74,23 +74,23 @@ The `AlertingRule` CR contains a set of specifications and webhook validation de kind: AlertingRule metadata: name: app-user-workload - namespace: app-ns <1> - labels: <2> - openshift.io/: "true" + namespace: app-ns #<1> + labels: #<2> + openshift.io/cluster-monitoring: "true" spec: - tenantID: "application" + tenantID: application groups: - name: AppUserWorkloadHighError rules: - alert: - expr: | <3> - sum(rate({kubernetes_namespace_name="app-ns", kubernetes_pod_name=~"podName.*"} |= "error" [1m])) by (job) + expr: | #<3> + sum(rate({kubernetes_namespace_name="app-ns", kubernetes_pod_name=~"podName.*"} |= "error" [1m])) by (job) for: 10s labels: - severity: critical <4> + severity: critical #<4> annotations: - summary: <5> - description: <6> + summary: This is an example summary. #<5> + description: This is an example description. #<6> ---- <1> The namespace where this `AlertingRule` CR is created must have a label matching the LokiStack `spec.rules.namespaceSelector` definition. <2> The `labels` block must match the LokiStack `spec.rules.selector` definition. 
diff --git a/modules/logging-vector-collector-alerts.adoc b/modules/logging-vector-collector-alerts.adoc index 4d2c5e8d07eb..90b3689d24cb 100644 --- a/modules/logging-vector-collector-alerts.adoc +++ b/modules/logging-vector-collector-alerts.adoc @@ -1,36 +1,30 @@ // Module included in the following assemblies: // -// * observability/logging/logging_alerts/default-logging-alerts.adoc +// * logging_alerts/default-logging-alerts.adoc :_mod-docs-content-type: REFERENCE [id="logging-vector-collector-alerts_{context}"] -= Vector collector alerts += {clo} alerts -In logging 5.7 and later versions, the following alerts are generated by the Vector collector. You can view these alerts in the {ocp-product-title} web console. +The following alerts are generated by the Vector collector. You can view these alerts in the {ocp-product-title} web console. .Vector collector alerts [cols="2,2,2,1",options="header"] |=== |Alert |Message |Description |Severity -|`CollectorHighErrorRate` -|` of records have resulted in an error by vector .` -|The number of vector output errors is high, by default more than 10 in the previous 15 minutes. -|Warning - |`CollectorNodeDown` |`Prometheus could not scrape vector for more than 10m.` |Vector is reporting that Prometheus could not scrape a specific Vector instance. |Critical -|`CollectorVeryHighErrorRate` -|` of records have resulted in an error by vector .` -|The number of Vector component errors are very high, by default more than 25 in the previous 15 minutes. -|Critical - -|`FluentdQueueLengthIncreasing` -|`In the last 1h, fluentd buffer queue length constantly increased more than 1. Current value is .` -|Fluentd is reporting that the queue size is increasing. +|`DiskBufferUsage` +|`Collectors potentially consuming too much node disk, ` +|Collectors are consuming too much node disk on the host. |Warning +|`CollectorHigh403ForbiddenResponseRate` +|`High rate of "HTTP 403 Forbidden" responses detected for collector in namespace for output