From 3c6d0491a8ed8ffdb3af84ce6f0874570b3247cf Mon Sep 17 00:00:00 2001 From: x19 <100000306+0xNineteen@users.noreply.github.com> Date: Thu, 9 Jan 2025 10:14:29 -0500 Subject: [PATCH] fix(gossip): consistent scope + less alerts (#484) --- metrics/grafana/alerting/alert_rules.yaml | 14 ++---- metrics/grafana/alerting/contact_points.yaml | 2 +- .../grafana/dashboards/gossip_metrics.json | 47 ++++++++++++++++++- src/gossip/service.zig | 33 ++++++------- 4 files changed, 68 insertions(+), 28 deletions(-) diff --git a/metrics/grafana/alerting/alert_rules.yaml b/metrics/grafana/alerting/alert_rules.yaml index 9e64382c2..4c16b2187 100644 --- a/metrics/grafana/alerting/alert_rules.yaml +++ b/metrics/grafana/alerting/alert_rules.yaml @@ -49,12 +49,11 @@ groups: maxDataPoints: 43200 refId: C type: threshold - noDataState: NoData + noDataState: KeepLast execErrState: Error for: 1m annotations: summary: "sig memory warning: using {{ $values.A.Value }}% of RAM" - labels: {} isPaused: false notification_settings: receiver: slack-sig-alerts @@ -137,17 +136,14 @@ groups: type: threshold dashboardUid: jBuN47BVz panelId: 26 - noDataState: NoData + noDataState: KeepLast + for: 1m execErrState: Error - for: 0s annotations: __dashboardUid__: jBuN47BVz __panelId__: "26" - description: "" - runbook_url: "" - summary: "error: [{{ $labels.scope }}]: {{ $labels.message }} " - labels: - "": "" + summary: "error: [{{ $labels.scope }}]: {{ $labels.message }}" + labels: {} isPaused: false notification_settings: receiver: slack-sig-alerts diff --git a/metrics/grafana/alerting/contact_points.yaml b/metrics/grafana/alerting/contact_points.yaml index 0e4056ea6..daa68aae7 100644 --- a/metrics/grafana/alerting/contact_points.yaml +++ b/metrics/grafana/alerting/contact_points.yaml @@ -11,7 +11,7 @@ contactPoints: url: https://${SLACK_WEBHOOK_URL} text: |- {{ range .Alerts.Firing }} - {{ .Annotations.summary }} -- {{ .PanelURL }} + {{ .Annotations.summary }} {{ end }} title: "{{ len .Alerts.Firing }} Alert(s) Firing" disableResolveMessage: false diff --git a/metrics/grafana/dashboards/gossip_metrics.json b/metrics/grafana/dashboards/gossip_metrics.json index c7a139398..0f8d61559 100644 --- a/metrics/grafana/dashboards/gossip_metrics.json +++ b/metrics/grafana/dashboards/gossip_metrics.json @@ -24,7 +24,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 2, + "id": 6, "links": [], "panels": [ { @@ -2007,6 +2007,49 @@ ], "title": "Error Service Logs", "type": "logs" + }, + { + "datasource": { + "default": false, + "type": "loki", + "uid": "P8E80F9AEF21F6940" + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 64 + }, + "id": 31, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P8E80F9AEF21F6940" + }, + "editorMode": "builder", + "expr": "{scope=\"gossip_service\", level=\"warning\"} |= ``", + "queryType": "range", + "refId": "A" + } + ], + "title": "Warning Service Logs", + "type": "logs" } ], "preload": false, @@ -2024,6 +2067,6 @@ "timezone": "", "title": "Gossip Metrics", "uid": "jBuN47BVz", - "version": 5, + "version": 2, "weekStart": "" } diff --git a/src/gossip/service.zig b/src/gossip/service.zig index 3f3e4e1b4..044b4ce47 100644 --- a/src/gossip/service.zig +++ b/src/gossip/service.zig @@ -1,7 +1,6 @@ const std = @import("std"); const network = @import("zig-network"); const sig = @import("../sig.zig"); -const Bloom = @import("../bloom/bloom.zig").Bloom; const bincode = sig.bincode; const socket_utils = sig.net.socket_utils; @@ -15,10 +14,10 @@ const KeyPair = std.crypto.sign.Ed25519.KeyPair; const EndPoint = network.EndPoint; const UdpSocket = network.Socket; +const Bloom = sig.bloom.Bloom; const Pubkey = sig.core.Pubkey; const Hash = sig.core.Hash; const Logger = sig.trace.log.Logger; -const ScopedLogger = sig.trace.log.ScopedLogger; const Packet = sig.net.Packet; const EchoServer = sig.net.echo.Server; const SocketAddr = sig.net.SocketAddr; @@ -169,13 +168,14 @@ pub const GossipService = struct { thread_pool: ThreadPool, // TODO: fix when http server is working // echo_server: EchoServer, - logger: ScopedLogger(LOG_SCOPE), + logger: ScopedLogger, metrics: GossipMetrics, service_manager: ServiceManager, const Self = @This(); pub const LOG_SCOPE = "gossip_service"; + pub const ScopedLogger = sig.trace.log.ScopedLogger(LOG_SCOPE); const Entrypoint = struct { addr: SocketAddr, info: ?ContactInfo = null }; @@ -450,7 +450,7 @@ pub const GossipService = struct { gossip_value_allocator: std.mem.Allocator, packet: Packet, verified_incoming_channel: *Channel(GossipMessageWithEndpoint), - logger: ScopedLogger(@typeName(VerifyMessageEntry)), + logger: ScopedLogger, pub fn callback(self: *VerifyMessageEntry) !void { const packet = self.packet; @@ -460,19 +460,19 @@ pub const GossipService = struct { packet.data[0..packet.size], bincode.Params.standard, ) catch |e| { - self.logger.err().logf("gossip: packet_verify: failed to deserialize: {s}", .{@errorName(e)}); + self.logger.err().logf("packet_verify: failed to deserialize: {s}", .{@errorName(e)}); return; }; message.sanitize() catch |e| { - self.logger.err().logf("gossip: packet_verify: failed to sanitize: {s}", .{@errorName(e)}); + self.logger.err().logf("packet_verify: failed to sanitize: {s}", .{@errorName(e)}); bincode.free(self.gossip_value_allocator, message); return; }; message.verifySignature() catch |e| { self.logger.err().logf( - "gossip: packet_verify: failed to verify signature from {}: {s}", + "packet_verify: failed to verify signature from {}: {s}", .{ packet.addr, @errorName(e) }, ); bincode.free(self.gossip_value_allocator, message); @@ -508,7 +508,7 @@ pub const GossipService = struct { .gossip_value_allocator = self.gossip_value_allocator, .verified_incoming_channel = self.verified_incoming_channel, .packet = undefined, - .logger = self.logger.withScope(@typeName(VerifyMessageEntry)), + .logger = self.logger, }; } @@ -850,7 +850,7 @@ pub const GossipService = struct { var x_timer = sig.time.Timer.start() catch unreachable; const now = getWallclockMs(); const n_pubkeys_dropped = gossip_table.attemptTrim(now, UNIQUE_PUBKEY_CAPACITY) catch |err| err_blk: { - self.logger.warn().logf("gossip_table.attemptTrim failed: {s}", .{@errorName(err)}); + self.logger.err().logf("gossip_table.attemptTrim failed: {s}", .{@errorName(err)}); break :err_blk 0; }; const elapsed = x_timer.read().asMillis(); @@ -1430,7 +1430,7 @@ pub const GossipService = struct { for (tasks) |*task| { packet_loop: for (task.output.items) |output| { self.packet_outgoing_channel.send(output) catch { - self.logger.err().log("failed to send outgoing packet"); + self.logger.err().log("handleBatchPullRequest: failed to send outgoing packet"); break :packet_loop; }; self.metrics.pull_responses_sent.add(1); @@ -1844,10 +1844,11 @@ pub const GossipService = struct { for (self.entrypoints.items) |entrypoint| { if (entrypoint.info) |info| { if (info.shred_version != 0) { - self.logger.info().logf( - "shred version: {} - from entrypoint contact info: {s}", - .{ info.shred_version, entrypoint.addr.toString().constSlice() }, - ); + self.logger.info() + .field("shred_version", info.shred_version) + .field("entrypoint", entrypoint.addr.toString().constSlice()) + .log("shred_version_from_entrypoint"); + self.my_shred_version.store(info.shred_version, .monotonic); self.my_contact_info.shred_version = info.shred_version; return true; @@ -2098,7 +2099,7 @@ pub const GossipMetrics = struct { // logging details _logging_fields: struct { // Scoping to GossipService instead of logging fields struct. - logger: ScopedLogger(GossipService.LOG_SCOPE), + logger: GossipService.ScopedLogger, log_interval_micros: i64 = 10 * std.time.us_per_s, last_log: i64 = 0, last_logged_snapshot: StatsToLog = .{}, @@ -2135,7 +2136,7 @@ pub const GossipMetrics = struct { 5000, 10000, }; - pub fn init(logger: ScopedLogger(GossipService.LOG_SCOPE)) GetMetricError!Self { + pub fn init(logger: GossipService.ScopedLogger) GetMetricError!Self { var self: Self = undefined; const registry = globalRegistry(); std.debug.assert(try registry.initFields(&self) == 1);