Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ SCRIPTS_SRC = $(shell find . \( -name '*.sh' -o -name '*.py' -o -name '*.mk' -o
-not -path './vendor/*' \
-not -path './idl/*' \
-not -path './jaeger-ui/*' \
-not -path './monitoring/jaeger-mixin/vendor/*' \
-type f | \
sort)

Expand Down Expand Up @@ -146,7 +147,7 @@ fmt: $(GOFUMPT)
@./scripts/lint/updateLicense.py $(ALL_SRC) $(SCRIPTS_SRC)

.PHONY: lint
lint: lint-fmt lint-license lint-imports lint-semconv lint-goversion lint-goleak lint-go
lint: lint-fmt lint-license lint-imports lint-semconv lint-goversion lint-goleak lint-go lint-monitoring
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These files are not changed often enough to justify adding this check to the main lint target. Instead, we can add a workflow that runs only when files in this directory are changed.


.PHONY: lint-license
lint-license:
Expand Down Expand Up @@ -253,3 +254,7 @@ repro-check:
$(MAKE) clean
$(MAKE) build-all-platforms
shasum -b -a 256 --strict --check ./sha256sum.combined.txt

.PHONY: lint-monitoring
lint-monitoring:
@./scripts/lint/lint-monitoring.sh
2 changes: 1 addition & 1 deletion monitoring/jaeger-mixin/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ This repository contains also a pre-built dashboard for Grafana and alert rules
- [Dashboard](./dashboard-for-grafana.json)
- [Alerts](./prometheus_alerts.yml)

_IMPORTANT_: the metrics that are used by default by the dashboard are compatible with the components deployed as part of the production strategy, where each component is deployed individually. Some metric names differ from the ones used in the all-in-one strategy. Adjust your dashboard to reflect your scenario.


## Background

Expand Down
75 changes: 17 additions & 58 deletions monitoring/jaeger-mixin/alerts.libsonnet
Original file line number Diff line number Diff line change
@@ -1,11 +1,6 @@
local percentErrs(metric, errSelectors) = '100 * sum(rate(%(metric)s{%(errSelectors)s}[1m])) by (instance, job, namespace) / sum(rate(%(metric)s[1m])) by (instance, job, namespace)' % {
metric: metric,
errSelectors: errSelectors,
};

local percentErrsWithTotal(metric_errs, metric_total) = '100 * sum(rate(%(metric_errs)s[1m])) by (instance, job, namespace) / sum(rate(%(metric_total)s[1m])) by (instance, job, namespace)' % {
metric_errs: metric_errs,
metric_total: metric_total,
// percent builds a PromQL expression string of the form
// '100 * <numerator> / <denominator>' by substituting the two arguments,
// which are expected to be PromQL expression strings themselves.
// NOTE(review): none of the alert rules visible in this hunk call this helper
// (they spell out their expressions inline) — confirm it is used, or drop it.
local percent(numerator, denominator) = '100 * %(numerator)s / %(denominator)s' % {
numerator: numerator,
denominator: denominator,
};

{
Expand All @@ -14,8 +9,8 @@ local percentErrsWithTotal(metric_errs, metric_total) = '100 * sum(rate(%(metric
{
name: 'jaeger_alerts',
rules: [{
alert: 'JaegerHTTPServerErrs',
expr: percentErrsWithTotal('jaeger_agent_http_server_errors_total', 'jaeger_agent_http_server_total') + '> 1',
alert: 'OtelHttpServerErrors',
expr: '100 * sum(rate(otelcol_http_server_duration_count{http_status_code=~"5.."}[1m])) by (instance, job) / sum(rate(otelcol_http_server_duration_count[1m])) by (instance, job) > 1',
'for': '15m',
labels: {
severity: 'warning',
Expand All @@ -26,87 +21,51 @@ local percentErrsWithTotal(metric_errs, metric_total) = '100 * sum(rate(%(metric
|||,
},
}, {
alert: 'JaegerRPCRequestsErrors',
expr: percentErrs('jaeger_client_jaeger_rpc_http_requests', 'status_code=~"4xx|5xx"') + '> 1',
'for': '15m',
labels: {
severity: 'warning',
},
annotations: {
message: |||
{{ $labels.job }} {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% RPC HTTP errors.
|||,
},
}, {
alert: 'JaegerClientSpansDropped',
expr: percentErrs('jaeger_reporter_spans', 'result=~"dropped|err"') + '> 1',
'for': '15m',
labels: {
severity: 'warning',
},
annotations: {
message: |||
service {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
|||,
},
}, {
alert: 'JaegerAgentSpansDropped',
expr: percentErrsWithTotal('jaeger_agent_reporter_batches_failures_total', 'jaeger_agent_reporter_batches_submitted_total') + '> 1',
'for': '15m',
labels: {
severity: 'warning',
},
annotations: {
message: |||
agent {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
|||,
},
}, {
alert: 'JaegerCollectorDroppingSpans',
expr: percentErrsWithTotal('jaeger_collector_spans_dropped_total', 'jaeger_collector_spans_received_total') + '> 1',
alert: 'OtelExporterQueueFull',
expr: '100 * otelcol_exporter_queue_size / otelcol_exporter_queue_capacity > 80',
'for': '15m',
labels: {
severity: 'warning',
},
annotations: {
message: |||
collector {{ $labels.job }} {{ $labels.instance }} is dropping {{ printf "%.2f" $value }}% spans.
{{ $labels.job }} {{ $labels.instance }} exporter queue is at {{ printf "%.2f" $value }}% of capacity (threshold 80%).
|||,
},
}, {
alert: 'JaegerSamplingUpdateFailing',
expr: percentErrs('jaeger_sampler_queries', 'result="err"') + '> 1',
alert: 'OtelHighMemoryUsage',
expr: 'otelcol_process_memory_rss > 100000000',
'for': '15m',
labels: {
severity: 'warning',
},
annotations: {
message: |||
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating sampling policies.
{{ $labels.job }} {{ $labels.instance }} memory usage is high at {{ humanize $value }} bytes.
|||,
},
}, {
alert: 'JaegerThrottlingUpdateFailing',
expr: percentErrs('jaeger_throttler_updates', 'result="err"') + '> 1',
alert: 'OtelHighCpuUsage',
expr: 'rate(otelcol_process_cpu_seconds[5m]) > 0.8',
'for': '15m',
labels: {
severity: 'warning',
},
annotations: {
message: |||
{{ $labels.job }} {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% in updating throttling policies.
{{ $labels.job }} {{ $labels.instance }} CPU usage is high ({{ printf "%.2f" $value }} CPU cores on average over the last 5m).
|||,
},
}, {
alert: 'JaegerQueryReqsFailing',
expr: percentErrs('jaeger_query_requests_total', 'result="err"') + '> 1',
alert: 'OtelProcessorBatchHighCardinality',
expr: 'otelcol_processor_batch_metadata_cardinality > 1000',
'for': '15m',
labels: {
severity: 'warning',
},
annotations: {
message: |||
{{ $labels.job }} {{ $labels.instance }} is seeing {{ printf "%.2f" $value }}% query errors on {{ $labels.operation }}.
{{ $labels.job }} {{ $labels.instance }} has high metadata cardinality ({{ printf "%.0f" $value }} combinations).
|||,
},
}],
Expand Down
Loading