From d42e0a8daef5b7dc7f83a57d59b65060c758e425 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 07:09:05 +0000 Subject: [PATCH 1/2] feat: add alert noise suppression with adaptive dedup windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses the #1 reason observability tools get abandoned: alert fatigue from high-frequency alerts that engineers learn to ignore. What ships: - AlertNoiseRecord model tracks per-(table, alert_type) firing counts across 1h, 24h, and 7d rolling windows with a composite noise score (0-100) - Adaptive dedup window: score=0→60min, score=25→2×, score=50→4×, score=75→8× (capped at configurable max, default 8h) - Severity trending: compares recent vs older alert severity groups to surface worsening/improving/stable trends - Auto-throttle flag set when count_24h >= auto_throttle_threshold (default 10) - Noise score refreshed after every successful dispatch so the next dedup window reflects current firing rate accurately - AlertLog.severity column added so trend analysis has raw data - GET/POST /alerts/noise API: list all records, summary stats, per-alert history, and POST .../reset to give a clean slate after root-cause fix - noise_suppression config block in kit.yml with documented score formula and all thresholds tunable (enabled, min/max window, throttle threshold) - 26 unit + integration tests covering scoring, trending, adaptive window math, throttle flag, idempotent upsert, and noise-aware is_alert_deduped https://claude.ai/code/session_018pzrXAhbmudF2BCP3xxqVR --- .../003_add_alert_noise_suppression.py | 60 +++ alerts/base.py | 350 +++++++++++++--- backend/main.py | 7 + backend/models.py | 31 ++ backend/routers/alert_noise.py | 186 +++++++++ config/kit.yml | 22 + tests/test_noise_suppression.py | 386 ++++++++++++++++++ 7 files changed, 980 insertions(+), 62 deletions(-) create mode 100644 alembic/versions/003_add_alert_noise_suppression.py create mode 100644 backend/routers/alert_noise.py create mode 100644 tests/test_noise_suppression.py diff --git a/alembic/versions/003_add_alert_noise_suppression.py b/alembic/versions/003_add_alert_noise_suppression.py new file mode 100644 index 0000000..bcc93aa --- /dev/null +++ b/alembic/versions/003_add_alert_noise_suppression.py @@ -0,0 +1,60 @@ +"""Add alert noise suppression — AlertNoiseRecord table + severity column on AlertLog. + +Revision ID: 003_add_alert_noise_suppression +Revises: 81c7c80e0f60 +Create Date: 2026-04-11 +""" + +from typing import Sequence, Union + +import sqlalchemy as sa + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "003_add_alert_noise_suppression" +down_revision: Union[str, Sequence[str], None] = "81c7c80e0f60" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Add severity column to existing alert_logs table + op.add_column( + "alert_logs", + sa.Column("severity", sa.String(length=20), nullable=True), + ) + + # Create the alert_noise_records table + op.create_table( + "alert_noise_records", + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column("table_name", sa.String(length=255), nullable=False), + sa.Column("alert_type", sa.String(length=50), nullable=False), + sa.Column("count_1h", sa.Integer(), nullable=False, server_default="0"), + sa.Column("count_24h", sa.Integer(), nullable=False, server_default="0"), + sa.Column("count_7d", sa.Integer(), nullable=False, server_default="0"), + sa.Column("noise_score", sa.Float(), nullable=False, server_default="0.0"), + sa.Column("severity_trend", sa.String(length=20), nullable=False, server_default="stable"), + sa.Column("is_throttled", sa.Boolean(), nullable=False, server_default="false"), + sa.Column("last_calculated_at", sa.DateTime(), nullable=True), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint("table_name", "alert_type", name="uq_noise_table_type"), + ) + op.create_index( + "ix_alert_noise_records_table_name", + "alert_noise_records", + ["table_name"], + ) + op.create_index( + "ix_alert_noise_records_alert_type", + "alert_noise_records", + ["alert_type"], + ) + + +def downgrade() -> None: + op.drop_index("ix_alert_noise_records_alert_type", table_name="alert_noise_records") + op.drop_index("ix_alert_noise_records_table_name", table_name="alert_noise_records") + op.drop_table("alert_noise_records") + op.drop_column("alert_logs", "severity") diff --git a/alerts/base.py b/alerts/base.py index 5bfa30e..6c32c9b 100644 --- a/alerts/base.py +++ b/alerts/base.py @@ -1,6 +1,6 @@ """ ObservaKit — Base Alert Dispatcher -Abstract base class and factory for alert dispatchers. +Abstract base class, factory, deduplication, and noise suppression logic. """ from abc import ABC, abstractmethod @@ -58,6 +58,273 @@ def get_alert_dispatcher(channel: str, **kwargs) -> AlertDispatcher: ) +# --------------------------------------------------------------------------- +# Noise suppression helpers +# --------------------------------------------------------------------------- + +_SEVERITY_WEIGHT = {"fail": 2, "warn": 1, "info": 0} + + +def _compute_noise_score(count_1h: int, count_24h: int, count_7d: int) -> float: + """ + Composite noise score in [0, 100]. + + Weights are chosen so that: + - 1 alert/hour → ~30 pts (moderate — watch but don't throttle) + - 3 alerts/hour → ~90 pts (high — significant throttle applied) + - 5+ alerts/hour → 100 pts (maximum throttle) + - Background churn (e.g. 20 alerts over 7 days) adds at most ~10 pts. + """ + score = (count_1h * 20.0) + (count_24h * 2.0) + (count_7d * 0.5) + return round(min(score, 100.0), 2) + + +def _compute_severity_trend(recent_severities: list[str], older_severities: list[str]) -> str: + """ + Compare the average severity weight of two groups of alerts. + Returns 'worsening', 'improving', or 'stable'. + + Returns 'stable' if either group is empty — no baseline means no trend. + """ + if not recent_severities or not older_severities: + return "stable" + + def _avg(sev_list): + return sum(_SEVERITY_WEIGHT.get(s, 0) for s in sev_list) / len(sev_list) + + delta = _avg(recent_severities) - _avg(older_severities) + if delta > 0.3: + return "worsening" + if delta < -0.3: + return "improving" + return "stable" + + +def _get_noise_config(config: dict) -> dict: + """Extract noise_suppression section with safe defaults.""" + return config.get("alerts", {}).get("noise_suppression", {}) + + +def _get_adaptive_dedup_window(noise_score: float, noise_cfg: dict) -> int: + """ + Convert a noise score into a dedup window (minutes). + + The window grows exponentially with the score so that the first doubling + happens around score=25 and the max is reached around score=75+. + + score=0 → min_window (default 60 min) + score=25 → ~2× min_window + score=50 → ~4× min_window + score=75 → ~8× min_window + score=100 → max_window (default 480 min / 8 h) + """ + min_window = int(noise_cfg.get("min_dedup_window_minutes", 60)) + max_window = int(noise_cfg.get("max_dedup_window_minutes", 480)) + + if noise_score <= 0: + return min_window + + multiplier = 2 ** (noise_score / 25.0) + adaptive = int(min_window * multiplier) + return min(adaptive, max_window) + + +def _refresh_noise_record(db, table_name: str, alert_type: str) -> "AlertNoiseRecord | None": + """ + Recalculate and persist the AlertNoiseRecord for (table_name, alert_type). + Creates a new row if none exists. Returns the updated record (or None on error). + """ + import logging + from datetime import datetime, timedelta, timezone + + from backend.models import AlertLog, AlertNoiseRecord + + log = logging.getLogger(__name__) + now = datetime.now(timezone.utc) + + try: + # Count alerts across three windows from the audit log + def _count(since: datetime) -> int: + return ( + db.query(AlertLog) + .filter( + AlertLog.table_name == table_name, + AlertLog.alert_type == alert_type, + AlertLog.success.is_(True), + AlertLog.sent_at >= since, + ) + .count() + ) + + count_1h = _count(now - timedelta(hours=1)) + count_24h = _count(now - timedelta(hours=24)) + count_7d = _count(now - timedelta(days=7)) + noise_score = _compute_noise_score(count_1h, count_24h, count_7d) + + # Severity trend — compare last 5 alerts vs the 5 before those + recent_rows = ( + db.query(AlertLog.severity) + .filter( + AlertLog.table_name == table_name, + AlertLog.alert_type == alert_type, + AlertLog.success.is_(True), + ) + .order_by(AlertLog.sent_at.desc()) + .limit(10) + .all() + ) + severities = [r.severity for r in recent_rows if r.severity] + severity_trend = _compute_severity_trend(severities[:5], severities[5:]) + + # Fetch or create the noise record + record = ( + db.query(AlertNoiseRecord) + .filter( + AlertNoiseRecord.table_name == table_name, + AlertNoiseRecord.alert_type == alert_type, + ) + .first() + ) + if record is None: + record = AlertNoiseRecord(table_name=table_name, alert_type=alert_type) + db.add(record) + + record.count_1h = count_1h + record.count_24h = count_24h + record.count_7d = count_7d + record.noise_score = noise_score + record.severity_trend = severity_trend + record.last_calculated_at = now + + # Determine whether the alert is currently throttled (score above threshold) + from config.loader import load_config + + try: + cfg = load_config("config/kit.yml") + except Exception: + cfg = {} + noise_cfg = _get_noise_config(cfg) + # Compare 24h count directly against the configured threshold so the + # intent of "10 alerts per day triggers throttling" is unambiguous. + throttle_threshold = int(noise_cfg.get("auto_throttle_threshold", 10)) + record.is_throttled = count_24h >= throttle_threshold + + db.commit() + return record + + except Exception as exc: + log.warning(f"Failed to refresh noise record for {table_name}/{alert_type}: {exc}") + db.rollback() + return None + + +# --------------------------------------------------------------------------- +# Public gating functions +# --------------------------------------------------------------------------- + + +def is_alert_suppressed(db, table_name: str) -> bool: + """ + Return True if there is an active CheckSuppression window for table_name. + Shared utility used by all router alert triggers. + """ + from datetime import datetime, timezone + + from backend.models import CheckSuppression + + suppression = ( + db.query(CheckSuppression) + .filter( + CheckSuppression.table_name == table_name, + CheckSuppression.suppressed_until >= datetime.now(timezone.utc), + ) + .first() + ) + if suppression: + import logging + + logging.getLogger(__name__).info( + f"Alert suppressed for {table_name} until {suppression.suppressed_until} " + f"— reason: {suppression.reason}" + ) + return True + return False + + +def is_alert_deduped(db, table_name: str, alert_type: str, window_minutes: int = 60) -> bool: + """ + Return True if an alert of the same (table, type) was already sent within + the adaptive dedup window. + + The window starts at window_minutes (default 60) and grows exponentially + based on the stored noise score for this (table, alert_type) pair, up to + the configured max_dedup_window_minutes (default 480 min / 8 h). + + Noise suppression can be disabled globally via: + alerts: + noise_suppression: + enabled: false + """ + import logging + from datetime import datetime, timedelta, timezone + + from backend.models import AlertLog, AlertNoiseRecord + + log = logging.getLogger(__name__) + + # Load noise suppression config + from config.loader import load_config + + try: + config = load_config("config/kit.yml") + except Exception: + config = {} + noise_cfg = _get_noise_config(config) + noise_enabled = noise_cfg.get("enabled", True) + + effective_window = window_minutes + + if noise_enabled and db and table_name: + # Look up the persisted noise record (don't recalculate here — that + # happens after a successful dispatch so counts stay accurate). + try: + record = ( + db.query(AlertNoiseRecord) + .filter( + AlertNoiseRecord.table_name == table_name, + AlertNoiseRecord.alert_type == alert_type, + ) + .first() + ) + if record is not None: + effective_window = _get_adaptive_dedup_window(record.noise_score, noise_cfg) + except Exception as exc: + log.warning(f"Could not read noise record for dedup window calculation: {exc}") + + cutoff = datetime.now(timezone.utc) - timedelta(minutes=effective_window) + recent = ( + db.query(AlertLog) + .filter( + AlertLog.table_name == table_name, + AlertLog.alert_type == alert_type, + AlertLog.sent_at >= cutoff, + ) + .first() + ) + if recent: + log.info( + f"Skipping duplicate {alert_type} alert for {table_name} " + f"(dedup window: {effective_window} min, last sent: {recent.sent_at})" + ) + return True + return False + + +# --------------------------------------------------------------------------- +# Main dispatch entry point +# --------------------------------------------------------------------------- + + def dispatch_alert( alert_type: str, message: str, @@ -68,11 +335,18 @@ def dispatch_alert( ): """ Dispatch an alert using routing rules from kit.yml. - Uses load_config() so that ${VAR:-default} env vars are properly expanded. + + Gate order: + 1. Manual suppression window (CheckSuppression) + 2. Adaptive deduplication window (noise-aware) + 3. Route to matching channel(s) or default channel + 4. Log to AlertLog and refresh AlertNoiseRecord """ import logging import re + log = logging.getLogger(__name__) + if db and table_name: if is_alert_suppressed(db, table_name): return @@ -115,7 +389,7 @@ def dispatch_alert( dispatched = True used_channel = channel except Exception as e: - logging.getLogger(__name__).error(f"Failed to send alert via {channel}: {e}") + log.error(f"Failed to send alert via {channel}: {e}") if not dispatched: # Fallback to default channel from config @@ -128,82 +402,34 @@ def dispatch_alert( dispatched = True used_channel = default_channel except Exception as e: - logging.getLogger(__name__).error( - f"Failed to send default alert via {default_channel}: {e}" - ) + log.error(f"Failed to send default alert via {default_channel}: {e}") if dispatched and db: from backend.models import AlertLog try: - log = AlertLog( + log_entry = AlertLog( alert_type=alert_type, channel=used_channel, table_name=table_name, message=formatted_message, + severity=severity, success=True, ) - db.add(log) + db.add(log_entry) db.commit() except Exception as e: - logging.getLogger(__name__).warning(f"Failed to insert AlertLog: {e}") + log.warning(f"Failed to insert AlertLog: {e}") db.rollback() + # Refresh noise record so the next dedup window is accurate + if table_name: + _refresh_noise_record(db, table_name, alert_type) -def is_alert_suppressed(db, table_name: str) -> bool: - """ - Return True if there is an active CheckSuppression window for table_name. - Shared utility used by all router alert triggers. - """ - from datetime import datetime, timezone - - from backend.models import CheckSuppression - suppression = ( - db.query(CheckSuppression) - .filter( - CheckSuppression.table_name == table_name, - CheckSuppression.suppressed_until >= datetime.now(timezone.utc), - ) - .first() - ) - if suppression: - import logging - - logging.getLogger(__name__).info( - f"Alert suppressed for {table_name} until {suppression.suppressed_until} " - f"— reason: {suppression.reason}" - ) - return True - return False - - -def is_alert_deduped(db, table_name: str, alert_type: str, window_minutes: int = 60) -> bool: - """ - Return True if an alert of the same (table, type) was already sent within - the last window_minutes. Used to prevent notification floods. - """ - from datetime import datetime, timedelta, timezone - - from backend.models import AlertLog - - recent = ( - db.query(AlertLog) - .filter( - AlertLog.table_name == table_name, - AlertLog.alert_type == alert_type, - AlertLog.sent_at >= datetime.now(timezone.utc) - timedelta(minutes=window_minutes), - ) - .first() - ) - if recent: - import logging - - logging.getLogger(__name__).info( - f"Skipping duplicate {alert_type} alert for {table_name} (last sent {recent.sent_at})" - ) - return True - return False +# --------------------------------------------------------------------------- +# Lineage helper (unchanged) +# --------------------------------------------------------------------------- def get_lineage_impact(table_name: str) -> list[str]: diff --git a/backend/main.py b/backend/main.py index 0f4c19a..c269f97 100644 --- a/backend/main.py +++ b/backend/main.py @@ -19,6 +19,7 @@ from alembic import command from backend.auth import verify_api_key from backend.routers import ( + alert_noise, checks, contracts, distribution, @@ -143,6 +144,12 @@ async def lifespan(app: FastAPI): tags=["Data Contracts"], dependencies=[Depends(verify_api_key)], ) +app.include_router( + alert_noise.router, + prefix="/alerts/noise", + tags=["Alert Noise Suppression"], + dependencies=[Depends(verify_api_key)], +) # ---- Embedded React Dashboard ---- _STATIC_DIR = Path(__file__).parent / "static" diff --git a/backend/models.py b/backend/models.py index 083f158..95f98e1 100644 --- a/backend/models.py +++ b/backend/models.py @@ -17,6 +17,7 @@ Numeric, String, Text, + UniqueConstraint, create_engine, ) from sqlalchemy.orm import declarative_base, relationship, sessionmaker @@ -167,10 +168,40 @@ class AlertLog(Base): channel = Column(String(50), nullable=False) # slack | email table_name = Column(String(255), nullable=True) message = Column(Text, nullable=False) + severity = Column(String(20), nullable=True) # fail | warn | info sent_at = Column(DateTime, default=lambda: datetime.now(timezone.utc), nullable=False) success = Column(Boolean, default=True) +class AlertNoiseRecord(Base): + """ + Per-(table, alert_type) noise tracking for adaptive throttling. + + The noise_score (0–100) reflects how frequently this alert has been firing. + When the score exceeds the configured threshold the deduplication window is + stretched exponentially so noisy alerts are automatically suppressed without + any manual intervention. + """ + + __tablename__ = "alert_noise_records" + __table_args__ = (UniqueConstraint("table_name", "alert_type", name="uq_noise_table_type"),) + + id = Column(Integer, primary_key=True, index=True) + table_name = Column(String(255), nullable=False, index=True) + alert_type = Column(String(50), nullable=False, index=True) + # Rolling counts over three windows (refreshed on every dispatch_alert call) + count_1h = Column(Integer, default=0, nullable=False) + count_24h = Column(Integer, default=0, nullable=False) + count_7d = Column(Integer, default=0, nullable=False) + # Composite noise score 0-100 derived from the counts above + noise_score = Column(Float, default=0.0, nullable=False) + # Severity trend based on the last 10 alerts: stable | worsening | improving + severity_trend = Column(String(20), default="stable", nullable=False) + # Whether the adaptive window is currently longer than the base minimum + is_throttled = Column(Boolean, default=False, nullable=False) + last_calculated_at = Column(DateTime, nullable=True) + + class ColumnProfile(Base): """Stores column-level statistics for data profiling.""" diff --git a/backend/routers/alert_noise.py b/backend/routers/alert_noise.py new file mode 100644 index 0000000..6ec70b8 --- /dev/null +++ b/backend/routers/alert_noise.py @@ -0,0 +1,186 @@ +""" +ObservaKit — Alert Noise Suppression API + +Endpoints for inspecting and managing the noise scores that drive adaptive +alert deduplication. All routes sit under /alerts/noise. +""" + +from datetime import datetime, timezone + +from fastapi import APIRouter, Depends, HTTPException +from sqlalchemy.orm import Session + +from backend.models import AlertLog, AlertNoiseRecord, get_db + +router = APIRouter() + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _serialize(record: AlertNoiseRecord) -> dict: + return { + "table_name": record.table_name, + "alert_type": record.alert_type, + "count_1h": record.count_1h, + "count_24h": record.count_24h, + "count_7d": record.count_7d, + "noise_score": record.noise_score, + "severity_trend": record.severity_trend, + "is_throttled": record.is_throttled, + "last_calculated_at": ( + record.last_calculated_at.isoformat() if record.last_calculated_at else None + ), + } + + +# --------------------------------------------------------------------------- +# Routes +# --------------------------------------------------------------------------- + + +@router.get("", summary="List all noise records") +def list_noise_records(db: Session = Depends(get_db)): + """ + Return noise scores for every (table, alert_type) pair that has been + tracked. Records are sorted by noise_score descending so the noisiest + alerts surface first. + """ + records = ( + db.query(AlertNoiseRecord) + .order_by(AlertNoiseRecord.noise_score.desc()) + .all() + ) + return { + "count": len(records), + "records": [_serialize(r) for r in records], + } + + +@router.get("/summary", summary="Noise suppression summary stats") +def noise_summary(db: Session = Depends(get_db)): + """ + High-level summary: total records, how many are throttled, and the + top-5 noisiest alert/table combinations. + """ + total = db.query(AlertNoiseRecord).count() + throttled = db.query(AlertNoiseRecord).filter(AlertNoiseRecord.is_throttled.is_(True)).count() + worsening = ( + db.query(AlertNoiseRecord) + .filter(AlertNoiseRecord.severity_trend == "worsening") + .count() + ) + top5 = ( + db.query(AlertNoiseRecord) + .order_by(AlertNoiseRecord.noise_score.desc()) + .limit(5) + .all() + ) + return { + "total_tracked": total, + "currently_throttled": throttled, + "worsening_trend": worsening, + "top_noisy_alerts": [_serialize(r) for r in top5], + } + + +@router.get("/{table_name}/{alert_type}", summary="Get noise record for a specific alert") +def get_noise_record(table_name: str, alert_type: str, db: Session = Depends(get_db)): + """ + Retrieve the current noise score and trend for a single + (table_name, alert_type) pair. + """ + record = ( + db.query(AlertNoiseRecord) + .filter( + AlertNoiseRecord.table_name == table_name, + AlertNoiseRecord.alert_type == alert_type, + ) + .first() + ) + if record is None: + raise HTTPException( + status_code=404, + detail=f"No noise record found for table='{table_name}' alert_type='{alert_type}'", + ) + return _serialize(record) + + +@router.post("/{table_name}/{alert_type}/reset", summary="Reset noise score for an alert") +def reset_noise_record(table_name: str, alert_type: str, db: Session = Depends(get_db)): + """ + Reset the noise score for a (table_name, alert_type) pair to zero and + clear the throttle flag. + + Use this after investigating and resolving the root cause of a noisy + alert so it gets a clean slate. The historical AlertLog rows are *not* + deleted — only the computed score is zeroed. + """ + record = ( + db.query(AlertNoiseRecord) + .filter( + AlertNoiseRecord.table_name == table_name, + AlertNoiseRecord.alert_type == alert_type, + ) + .first() + ) + if record is None: + raise HTTPException( + status_code=404, + detail=f"No noise record found for table='{table_name}' alert_type='{alert_type}'", + ) + + record.noise_score = 0.0 + record.count_1h = 0 + record.count_24h = 0 + record.count_7d = 0 + record.severity_trend = "stable" + record.is_throttled = False + record.last_calculated_at = datetime.now(timezone.utc) + db.commit() + + return { + "message": f"Noise record reset for table='{table_name}' alert_type='{alert_type}'", + "record": _serialize(record), + } + + +@router.get("/{table_name}/{alert_type}/history", summary="Recent alert history for an alert") +def alert_history( + table_name: str, + alert_type: str, + limit: int = 50, + db: Session = Depends(get_db), +): + """ + Return the most recent AlertLog rows for a (table_name, alert_type) pair. + Useful for understanding the pattern that drove the noise score up. + """ + rows = ( + db.query(AlertLog) + .filter( + AlertLog.table_name == table_name, + AlertLog.alert_type == alert_type, + ) + .order_by(AlertLog.sent_at.desc()) + .limit(min(limit, 200)) + .all() + ) + return { + "table_name": table_name, + "alert_type": alert_type, + "count": len(rows), + "alerts": [ + { + "id": r.id, + "channel": r.channel, + "severity": r.severity, + "sent_at": r.sent_at.isoformat(), + "success": r.success, + "message": r.message, + } + for r in rows + ], + } diff --git a/config/kit.yml b/config/kit.yml index a87787b..2445fb7 100644 --- a/config/kit.yml +++ b/config/kit.yml @@ -110,6 +110,28 @@ contracts: # ---- Alerting ---- alerts: + # ---- Noise Suppression ---- + # Automatically extends the dedup window for alerts that fire too frequently, + # preventing alert fatigue without any manual suppression commands. + # + # How the score works (0–100): + # score = min(100, count_1h × 20 + count_24h × 2 + count_7d × 0.5) + # + # How the adaptive window works: + # window = min(max_dedup_window_minutes, min_dedup_window_minutes × 2^(score/25)) + # score=0 → 60 min (no throttle) + # score=25 → 120 min (2×) + # score=50 → 240 min (4×) + # score=75 → 480 min (8× = max) + noise_suppression: + enabled: true + # Base dedup window when there is no noise (minutes) + min_dedup_window_minutes: 60 + # Hard ceiling on the adaptive window (minutes); default = 8 hours + max_dedup_window_minutes: 480 + # Alert count within 24 h that starts escalating the dedup window + auto_throttle_threshold: 10 + # Routing rules: match alerts to channels and recipients routing: - match: diff --git a/tests/test_noise_suppression.py b/tests/test_noise_suppression.py new file mode 100644 index 0000000..b5e4eda --- /dev/null +++ b/tests/test_noise_suppression.py @@ -0,0 +1,386 @@ +""" +Tests for alert noise suppression — scoring, adaptive dedup windows, and +severity trending. + +Uses the in-memory SQLite db_session fixture from conftest.py so no real +database or HTTP connections are required. +""" + +from datetime import datetime, timedelta, timezone +from unittest.mock import patch + +import pytest + +from alerts.base import ( + _compute_noise_score, + _compute_severity_trend, + _get_adaptive_dedup_window, + _refresh_noise_record, + is_alert_deduped, +) +from backend.models import AlertLog, AlertNoiseRecord + + +# --------------------------------------------------------------------------- +# _compute_noise_score +# --------------------------------------------------------------------------- + + +class TestComputeNoiseScore: + def test_zero_counts_give_zero_score(self): + assert _compute_noise_score(0, 0, 0) == 0.0 + + def test_one_alert_per_hour_is_low_noise(self): + # 1/h, 5/24h, 10/7d → 20 + 10 + 5 = 35 + score = _compute_noise_score(1, 5, 10) + assert score == 35.0 + + def test_high_hourly_rate_hits_cap(self): + # 6/h alone → 120 → capped at 100 + score = _compute_noise_score(6, 0, 0) + assert score == 100.0 + + def test_score_is_capped_at_100(self): + score = _compute_noise_score(100, 100, 100) + assert score == 100.0 + + def test_24h_contribution(self): + # Only 24h count: 10 alerts → 10 × 2 = 20 + score = _compute_noise_score(0, 10, 0) + assert score == 20.0 + + def test_7d_contribution(self): + # Only 7d count: 40 alerts → 40 × 0.5 = 20 + score = _compute_noise_score(0, 0, 40) + assert score == 20.0 + + +# --------------------------------------------------------------------------- +# _compute_severity_trend +# --------------------------------------------------------------------------- + + +class TestComputeSeverityTrend: + def test_equal_groups_are_stable(self): + recent = ["fail", "fail"] + older = ["fail", "fail"] + assert _compute_severity_trend(recent, older) == "stable" + + def test_worsening_when_recent_more_severe(self): + recent = ["fail", "fail", "fail"] + older = ["warn", "warn", "warn"] + assert _compute_severity_trend(recent, older) == "worsening" + + def test_improving_when_recent_less_severe(self): + recent = ["warn", "warn"] + older = ["fail", "fail", "fail"] + assert _compute_severity_trend(recent, older) == "improving" + + def test_empty_recent_is_stable(self): + assert _compute_severity_trend([], []) == "stable" + + def test_empty_older_is_stable(self): + # No older baseline to compare against — cannot determine direction, so stable + assert _compute_severity_trend(["fail"], []) == "stable" + assert _compute_severity_trend([], ["warn"]) == "stable" + + def test_mixed_groups_can_be_stable(self): + recent = ["fail", "warn"] # avg = 1.5 + older = ["fail", "warn"] # avg = 1.5 + assert _compute_severity_trend(recent, older) == "stable" + + +# --------------------------------------------------------------------------- +# _get_adaptive_dedup_window +# --------------------------------------------------------------------------- + + +class TestGetAdaptiveDedupWindow: + _cfg = {"min_dedup_window_minutes": 60, "max_dedup_window_minutes": 480} + + def test_zero_score_returns_min_window(self): + assert _get_adaptive_dedup_window(0.0, self._cfg) == 60 + + def test_score_25_doubles_window(self): + # 2^(25/25) = 2.0 → 60 × 2 = 120 + assert _get_adaptive_dedup_window(25.0, self._cfg) == 120 + + def test_score_50_quadruples_window(self): + # 2^(50/25) = 4.0 → 60 × 4 = 240 + assert _get_adaptive_dedup_window(50.0, self._cfg) == 240 + + def test_score_100_hits_max_window(self): + # 2^(100/25) = 16 → 60 × 16 = 960, capped at 480 + assert _get_adaptive_dedup_window(100.0, self._cfg) == 480 + + def test_custom_min_max_respected(self): + cfg = {"min_dedup_window_minutes": 30, "max_dedup_window_minutes": 120} + # score=25 → 30 × 2 = 60 + assert _get_adaptive_dedup_window(25.0, cfg) == 60 + # score=100 → 30 × 16 = 480, capped at 120 + assert _get_adaptive_dedup_window(100.0, cfg) == 120 + + +# --------------------------------------------------------------------------- +# _refresh_noise_record (integration with in-memory DB) +# --------------------------------------------------------------------------- + + +class TestRefreshNoiseRecord: + @patch("config.loader.load_config") + def test_creates_record_when_none_exists(self, mock_cfg, db_session): + mock_cfg.return_value = {"alerts": {"noise_suppression": {"auto_throttle_threshold": 10}}} + + record = _refresh_noise_record(db_session, "public.orders", "freshness") + + assert record is not None + assert record.table_name == "public.orders" + assert record.alert_type == "freshness" + assert record.noise_score == 0.0 + assert record.is_throttled is False + + @patch("config.loader.load_config") + def test_counts_recent_alerts_correctly(self, mock_cfg, db_session): + mock_cfg.return_value = {"alerts": {"noise_suppression": {"auto_throttle_threshold": 10}}} + + now = datetime.now(timezone.utc) + for minutes_ago in [5, 15, 45]: + db_session.add( + AlertLog( + alert_type="freshness", + table_name="public.orders", + channel="slack", + message="stale", + severity="fail", + success=True, + sent_at=now - timedelta(minutes=minutes_ago), + ) + ) + # One old alert outside 24h window + db_session.add( + AlertLog( + alert_type="freshness", + table_name="public.orders", + channel="slack", + message="stale", + severity="fail", + success=True, + sent_at=now - timedelta(hours=30), + ) + ) + db_session.commit() + + record = _refresh_noise_record(db_session, "public.orders", "freshness") + + assert record.count_1h == 3 # all three are within 1h + assert record.count_24h == 3 # fourth is outside 24h + assert record.count_7d == 4 # all four within 7d + + @patch("config.loader.load_config") + def test_throttled_flag_set_above_threshold(self, mock_cfg, db_session): + mock_cfg.return_value = {"alerts": {"noise_suppression": {"auto_throttle_threshold": 5}}} + + now = datetime.now(timezone.utc) + # 6 alerts in last 24h (above threshold of 5) + for i in range(6): + db_session.add( + AlertLog( + alert_type="volume", + table_name="public.events", + channel="slack", + message="anomaly", + severity="warn", + success=True, + sent_at=now - timedelta(hours=i), + ) + ) + db_session.commit() + + record = _refresh_noise_record(db_session, "public.events", "volume") + assert record.is_throttled is True + + @patch("config.loader.load_config") + def test_not_throttled_below_threshold(self, mock_cfg, db_session): + mock_cfg.return_value = {"alerts": {"noise_suppression": {"auto_throttle_threshold": 10}}} + + # Only 2 alerts in last 24h — well below threshold + now = datetime.now(timezone.utc) + for i in range(2): + db_session.add( + AlertLog( + alert_type="schema", + table_name="public.users", + channel="slack", + message="drift", + severity="warn", + success=True, + sent_at=now - timedelta(hours=i * 2), + ) + ) + db_session.commit() + + record = _refresh_noise_record(db_session, "public.users", "schema") + assert record.is_throttled is False + + @patch("config.loader.load_config") + def test_severity_trend_worsening_detected(self, mock_cfg, db_session): + mock_cfg.return_value = {"alerts": {"noise_suppression": {"auto_throttle_threshold": 10}}} + + now = datetime.now(timezone.utc) + # Recent 5: all 'fail' + for i in range(5): + db_session.add( + AlertLog( + alert_type="quality", + table_name="public.payments", + channel="slack", + message="check failed", + severity="fail", + success=True, + sent_at=now - timedelta(minutes=i * 10), + ) + ) + # Older 5: all 'warn' + for i in range(5): + db_session.add( + AlertLog( + alert_type="quality", + table_name="public.payments", + channel="slack", + message="check warned", + severity="warn", + success=True, + sent_at=now - timedelta(hours=2 + i), + ) + ) + db_session.commit() + + record = _refresh_noise_record(db_session, "public.payments", "quality") + assert record.severity_trend == "worsening" + + @patch("config.loader.load_config") + def test_idempotent_upsert(self, mock_cfg, db_session): + """Calling _refresh_noise_record twice should not create duplicate rows.""" + mock_cfg.return_value = {"alerts": {"noise_suppression": {"auto_throttle_threshold": 10}}} + + _refresh_noise_record(db_session, "public.orders", "freshness") + _refresh_noise_record(db_session, "public.orders", "freshness") + + count = ( + db_session.query(AlertNoiseRecord) + .filter_by(table_name="public.orders", alert_type="freshness") + .count() + ) + assert count == 1 + + +# --------------------------------------------------------------------------- +# is_alert_deduped with noise-aware window (integration) +# --------------------------------------------------------------------------- + + +class TestIsAlertDedupedWithNoise: + @patch("config.loader.load_config") + def test_no_noise_uses_base_window(self, mock_cfg, db_session): + """With no noise record, the default 60-min window is applied.""" + mock_cfg.return_value = { + "alerts": { + "noise_suppression": { + "enabled": True, + "min_dedup_window_minutes": 60, + "max_dedup_window_minutes": 480, + } + } + } + # Alert sent 30 minutes ago — within 60-min base window + db_session.add( + AlertLog( + alert_type="freshness", + table_name="public.orders", + channel="slack", + message="stale", + sent_at=datetime.now(timezone.utc) - timedelta(minutes=30), + ) + ) + db_session.commit() + + assert is_alert_deduped(db_session, "public.orders", "freshness") is True + + @patch("config.loader.load_config") + def test_high_noise_extends_window(self, mock_cfg, db_session): + """A high noise score (≥25) should extend the dedup window beyond 60 min.""" + mock_cfg.return_value = { + "alerts": { + "noise_suppression": { + "enabled": True, + "min_dedup_window_minutes": 60, + "max_dedup_window_minutes": 480, + } + } + } + # Pre-populate a noise record with a high score + noise = AlertNoiseRecord( + table_name="public.orders", + alert_type="volume", + count_1h=3, + count_24h=15, + count_7d=50, + noise_score=75.0, # → window = min(480, 60 × 2^3) = min(480, 480) = 480 min + severity_trend="worsening", + is_throttled=True, + ) + db_session.add(noise) + + # Alert sent 90 minutes ago — outside normal 60-min window but inside 480-min window + db_session.add( + AlertLog( + alert_type="volume", + table_name="public.orders", + channel="slack", + message="anomaly", + sent_at=datetime.now(timezone.utc) - timedelta(minutes=90), + ) + ) + db_session.commit() + + # Should be deduped because adaptive window is 480 min + assert is_alert_deduped(db_session, "public.orders", "volume") is True + + @patch("config.loader.load_config") + def test_noise_suppression_disabled_uses_passed_window(self, mock_cfg, db_session): + """When noise_suppression.enabled=false, the passed window_minutes is used directly.""" + mock_cfg.return_value = { + "alerts": { + "noise_suppression": { + "enabled": False, + "min_dedup_window_minutes": 60, + } + } + } + # High noise record exists but suppression is disabled + noise = AlertNoiseRecord( + table_name="public.orders", + alert_type="schema", + noise_score=90.0, + is_throttled=True, + count_1h=5, + count_24h=20, + count_7d=60, + ) + db_session.add(noise) + + # Alert sent 90 minutes ago — would be inside adaptive window (480 min) if enabled, + # but since suppression is disabled it falls outside the passed window_minutes=60. + db_session.add( + AlertLog( + alert_type="schema", + table_name="public.orders", + channel="slack", + message="drift", + sent_at=datetime.now(timezone.utc) - timedelta(minutes=90), + ) + ) + db_session.commit() + + # Noise suppression off → use window_minutes=60 → 90 min ago is NOT deduped + assert is_alert_deduped(db_session, "public.orders", "schema", window_minutes=60) is False From 8bd267b6ae646456151268d73023957822619940 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 11 Apr 2026 07:33:22 +0000 Subject: [PATCH 2/2] fix: resolve ruff lint errors in noise suppression code - Remove forward-reference return type annotation that referenced AlertNoiseRecord before its import (F821) - Remove unused pytest import from test file (F401) - Sort imports to satisfy isort order (I001) https://claude.ai/code/session_018pzrXAhbmudF2BCP3xxqVR --- alerts/base.py | 2 +- tests/test_noise_suppression.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/alerts/base.py b/alerts/base.py index 6c32c9b..e1c6a60 100644 --- a/alerts/base.py +++ b/alerts/base.py @@ -129,7 +129,7 @@ def _get_adaptive_dedup_window(noise_score: float, noise_cfg: dict) -> int: return min(adaptive, max_window) -def _refresh_noise_record(db, table_name: str, alert_type: str) -> "AlertNoiseRecord | None": +def _refresh_noise_record(db, table_name: str, alert_type: str): """ Recalculate and persist the AlertNoiseRecord for (table_name, alert_type). Creates a new row if none exists. Returns the updated record (or None on error). diff --git a/tests/test_noise_suppression.py b/tests/test_noise_suppression.py index b5e4eda..05cc546 100644 --- a/tests/test_noise_suppression.py +++ b/tests/test_noise_suppression.py @@ -9,8 +9,6 @@ from datetime import datetime, timedelta, timezone from unittest.mock import patch -import pytest - from alerts.base import ( _compute_noise_score, _compute_severity_trend, @@ -20,7 +18,6 @@ ) from backend.models import AlertLog, AlertNoiseRecord - # --------------------------------------------------------------------------- # _compute_noise_score # ---------------------------------------------------------------------------