diff --git a/.env.example b/.env.example index 0d10d4a..819682e 100644 --- a/.env.example +++ b/.env.example @@ -1,6 +1,13 @@ # DEFAULT local ENV_NAME=development +# DEPLOYMENT MODE +# Controls TLS enforcement for database connections: +# - saas: TLS MANDATORY for all DB connections (hard-fail at startup without TLS) +# - byoc: TLS recommended but not hard-enforced (customer-hosted) +# - local: TLS optional (developer workstation, default if unset) +DEPLOYMENT_MODE=local + # APP SERVER_PORT=4021 VERSION=v1.0.0 diff --git a/api/docs.go b/api/docs.go index 7a5253c..007ef68 100644 --- a/api/docs.go +++ b/api/docs.go @@ -16,55 +16,12 @@ const docTemplate = `{ "host": "{{.Host}}", "basePath": "{{.BasePath}}", "paths": { - "/health": { + "/readyz": { "get": { - "description": "Returns detailed health information including uptime and version", - "produces": [ + "description": "Returns dependency health status following canonical contract", + "consumes": [ "application/json" ], - "tags": [ - "Health" - ], - "summary": "Combined health check", - "responses": { - "200": { - "description": "OK", - "schema": { - "$ref": "#/definitions/internal_adapters_http_in_health.HealthResponse" - } - }, - "503": { - "description": "Service Unavailable", - "schema": { - "$ref": "#/definitions/internal_adapters_http_in_health.HealthResponse" - } - } - } - } - }, - "/health/live": { - "get": { - "description": "Returns 200 OK if the application process is running", - "produces": [ - "application/json" - ], - "tags": [ - "Health" - ], - "summary": "Kubernetes liveness probe", - "responses": { - "200": { - "description": "OK", - "schema": { - "$ref": "#/definitions/internal_adapters_http_in_health.HealthResponse" - } - } - } - } - }, - "/health/ready": { - "get": { - "description": "Returns 200 OK if the application is ready to serve traffic", "produces": [ "application/json" ], @@ -74,15 +31,15 @@ const docTemplate = `{ "summary": "Kubernetes readiness probe", "responses": { "200": { - "description": "OK", + "description": "Service is healthy - all dependencies up/skipped/n-a", "schema": { - "$ref": "#/definitions/internal_adapters_http_in_health.HealthResponse" + "$ref": "#/definitions/internal_adapters_http_in_readyz.Response" } }, "503": { - "description": "Service Unavailable", + "description": "Service is unhealthy - one or more dependencies down/degraded", "schema": { - "$ref": "#/definitions/internal_adapters_http_in_health.HealthResponse" + "$ref": "#/definitions/internal_adapters_http_in_readyz.Response" } } } @@ -3703,33 +3660,49 @@ const docTemplate = `{ } } }, - "internal_adapters_http_in_health.CheckResult": { + "internal_adapters_http_in_readyz.DependencyCheck": { "type": "object", "properties": { - "message": { + "breaker_state": { + "description": "when dep is breaker-wrapped", + "type": "string" + }, + "error": { + "description": "when status is down or degraded", + "type": "string" + }, + "latency_ms": { + "description": "when status is up or degraded", + "type": "integer" + }, + "reason": { + "description": "when status is skipped or n/a", "type": "string" }, "status": { + "description": "up/down/degraded/skipped/n/a (closed set)", "type": "string" + }, + "tls": { + "description": "when dependency supports TLS", + "type": "boolean" } } }, - "internal_adapters_http_in_health.HealthResponse": { + "internal_adapters_http_in_readyz.Response": { "type": "object", "properties": { "checks": { "type": "object", "additionalProperties": { - "$ref": "#/definitions/internal_adapters_http_in_health.CheckResult" + "$ref": "#/definitions/internal_adapters_http_in_readyz.DependencyCheck" } }, - "status": { - "type": "string" - }, - "timestamp": { + "deployment_mode": { "type": "string" }, - "uptime": { + "status": { + "description": "\"healthy\" or \"unhealthy\"", "type": "string" }, "version": { diff --git a/api/docs_test.go b/api/docs_test.go index 45657b5..1aba051 100644 --- a/api/docs_test.go +++ b/api/docs_test.go @@ -101,7 +101,7 @@ func TestDocTemplate(t *testing.T) { "/v1/workflows/{id}", "/v1/catalog/executors", "/v1/catalog/triggers", - "/health", + // NOTE: /health and /readyz are infrastructure routes excluded from OpenAPI spec }, }, { diff --git a/api/openapi.yaml b/api/openapi.yaml index 3deaee2..982382e 100644 --- a/api/openapi.yaml +++ b/api/openapi.yaml @@ -8,54 +8,22 @@ info: servers: - url: //localhost:4021/ paths: - /health: + /readyz: get: - description: Returns detailed health information including uptime and version + description: Returns dependency health status following canonical contract responses: "200": content: application/json: schema: - $ref: '#/components/schemas/internal_adapters_http_in_health.HealthResponse' - description: OK - "503": - content: - application/json: - schema: - $ref: '#/components/schemas/internal_adapters_http_in_health.HealthResponse' - description: Service Unavailable - summary: Combined health check - tags: - - Health - /health/live: - get: - description: Returns 200 OK if the application process is running - responses: - "200": - content: - application/json: - schema: - $ref: '#/components/schemas/internal_adapters_http_in_health.HealthResponse' - description: OK - summary: Kubernetes liveness probe - tags: - - Health - /health/ready: - get: - description: Returns 200 OK if the application is ready to serve traffic - responses: - "200": - content: - application/json: - schema: - $ref: '#/components/schemas/internal_adapters_http_in_health.HealthResponse' - description: OK + $ref: '#/components/schemas/internal_adapters_http_in_readyz.Response' + description: Service is healthy - all dependencies up/skipped/n-a "503": content: application/json: schema: - $ref: '#/components/schemas/internal_adapters_http_in_health.HealthResponse' - description: Service Unavailable + $ref: '#/components/schemas/internal_adapters_http_in_readyz.Response' + description: Service is unhealthy - one or more dependencies down/degraded summary: Kubernetes readiness probe tags: - Health @@ -3386,36 +3354,56 @@ components: valid: type: boolean type: object - internal_adapters_http_in_health.CheckResult: + internal_adapters_http_in_readyz.DependencyCheck: example: - message: message + reason: reason + tls: true + error: error + breaker_state: breaker_state + latency_ms: 0 status: status properties: - message: + breaker_state: + description: when dep is breaker-wrapped + type: string + error: + description: when status is down or degraded + type: string + latency_ms: + description: when status is up or degraded + type: integer + reason: + description: when status is skipped or n/a type: string status: + description: up/down/degraded/skipped/n/a (closed set) type: string + tls: + description: when dependency supports TLS + type: boolean type: object - internal_adapters_http_in_health.HealthResponse: + internal_adapters_http_in_readyz.Response: example: checks: key: - message: message + reason: reason + tls: true + error: error + breaker_state: breaker_state + latency_ms: 0 status: status version: version + deployment_mode: deployment_mode status: status - timestamp: timestamp - uptime: uptime properties: checks: additionalProperties: - $ref: '#/components/schemas/internal_adapters_http_in_health.CheckResult' + $ref: '#/components/schemas/internal_adapters_http_in_readyz.DependencyCheck' type: object - status: - type: string - timestamp: + deployment_mode: type: string - uptime: + status: + description: '"healthy" or "unhealthy"' type: string version: type: string diff --git a/api/swagger.json b/api/swagger.json index feba12b..b7c9cbf 100644 --- a/api/swagger.json +++ b/api/swagger.json @@ -10,55 +10,12 @@ "host": "localhost:4021", "basePath": "/", "paths": { - "/health": { + "/readyz": { "get": { - "description": "Returns detailed health information including uptime and version", - "produces": [ + "description": "Returns dependency health status following canonical contract", + "consumes": [ "application/json" ], - "tags": [ - "Health" - ], - "summary": "Combined health check", - "responses": { - "200": { - "description": "OK", - "schema": { - "$ref": "#/definitions/internal_adapters_http_in_health.HealthResponse" - } - }, - "503": { - "description": "Service Unavailable", - "schema": { - "$ref": "#/definitions/internal_adapters_http_in_health.HealthResponse" - } - } - } - } - }, - "/health/live": { - "get": { - "description": "Returns 200 OK if the application process is running", - "produces": [ - "application/json" - ], - "tags": [ - "Health" - ], - "summary": "Kubernetes liveness probe", - "responses": { - "200": { - "description": "OK", - "schema": { - "$ref": "#/definitions/internal_adapters_http_in_health.HealthResponse" - } - } - } - } - }, - "/health/ready": { - "get": { - "description": "Returns 200 OK if the application is ready to serve traffic", "produces": [ "application/json" ], @@ -68,15 +25,15 @@ "summary": "Kubernetes readiness probe", "responses": { "200": { - "description": "OK", + "description": "Service is healthy - all dependencies up/skipped/n-a", "schema": { - "$ref": "#/definitions/internal_adapters_http_in_health.HealthResponse" + "$ref": "#/definitions/internal_adapters_http_in_readyz.Response" } }, "503": { - "description": "Service Unavailable", + "description": "Service is unhealthy - one or more dependencies down/degraded", "schema": { - "$ref": "#/definitions/internal_adapters_http_in_health.HealthResponse" + "$ref": "#/definitions/internal_adapters_http_in_readyz.Response" } } } @@ -3697,33 +3654,49 @@ } } }, - "internal_adapters_http_in_health.CheckResult": { + "internal_adapters_http_in_readyz.DependencyCheck": { "type": "object", "properties": { - "message": { + "breaker_state": { + "description": "when dep is breaker-wrapped", + "type": "string" + }, + "error": { + "description": "when status is down or degraded", + "type": "string" + }, + "latency_ms": { + "description": "when status is up or degraded", + "type": "integer" + }, + "reason": { + "description": "when status is skipped or n/a", "type": "string" }, "status": { + "description": "up/down/degraded/skipped/n/a (closed set)", "type": "string" + }, + "tls": { + "description": "when dependency supports TLS", + "type": "boolean" } } }, - "internal_adapters_http_in_health.HealthResponse": { + "internal_adapters_http_in_readyz.Response": { "type": "object", "properties": { "checks": { "type": "object", "additionalProperties": { - "$ref": "#/definitions/internal_adapters_http_in_health.CheckResult" + "$ref": "#/definitions/internal_adapters_http_in_readyz.DependencyCheck" } }, - "status": { - "type": "string" - }, - "timestamp": { + "deployment_mode": { "type": "string" }, - "uptime": { + "status": { + "description": "\"healthy\" or \"unhealthy\"", "type": "string" }, "version": { diff --git a/api/swagger.yaml b/api/swagger.yaml index 4b2e6ef..ad96ccd 100644 --- a/api/swagger.yaml +++ b/api/swagger.yaml @@ -818,24 +818,37 @@ definitions: valid: type: boolean type: object - internal_adapters_http_in_health.CheckResult: + internal_adapters_http_in_readyz.DependencyCheck: properties: - message: + breaker_state: + description: when dep is breaker-wrapped + type: string + error: + description: when status is down or degraded + type: string + latency_ms: + description: when status is up or degraded + type: integer + reason: + description: when status is skipped or n/a type: string status: + description: up/down/degraded/skipped/n/a (closed set) type: string + tls: + description: when dependency supports TLS + type: boolean type: object - internal_adapters_http_in_health.HealthResponse: + internal_adapters_http_in_readyz.Response: properties: checks: additionalProperties: - $ref: '#/definitions/internal_adapters_http_in_health.CheckResult' + $ref: '#/definitions/internal_adapters_http_in_readyz.DependencyCheck' type: object - status: - type: string - timestamp: + deployment_mode: type: string - uptime: + status: + description: '"healthy" or "unhealthy"' type: string version: type: string @@ -848,50 +861,22 @@ info: title: Flowker API version: 1.0.0 paths: - /health: + /readyz: get: - description: Returns detailed health information including uptime and version - produces: - - application/json - responses: - "200": - description: OK - schema: - $ref: '#/definitions/internal_adapters_http_in_health.HealthResponse' - "503": - description: Service Unavailable - schema: - $ref: '#/definitions/internal_adapters_http_in_health.HealthResponse' - summary: Combined health check - tags: - - Health - /health/live: - get: - description: Returns 200 OK if the application process is running - produces: + consumes: - application/json - responses: - "200": - description: OK - schema: - $ref: '#/definitions/internal_adapters_http_in_health.HealthResponse' - summary: Kubernetes liveness probe - tags: - - Health - /health/ready: - get: - description: Returns 200 OK if the application is ready to serve traffic + description: Returns dependency health status following canonical contract produces: - application/json responses: "200": - description: OK + description: Service is healthy - all dependencies up/skipped/n-a schema: - $ref: '#/definitions/internal_adapters_http_in_health.HealthResponse' + $ref: '#/definitions/internal_adapters_http_in_readyz.Response' "503": - description: Service Unavailable + description: Service is unhealthy - one or more dependencies down/degraded schema: - $ref: '#/definitions/internal_adapters_http_in_health.HealthResponse' + $ref: '#/definitions/internal_adapters_http_in_readyz.Response' summary: Kubernetes readiness probe tags: - Health diff --git a/docs/PROJECT_RULES.md b/docs/PROJECT_RULES.md index a17bd17..fdd42ed 100644 --- a/docs/PROJECT_RULES.md +++ b/docs/PROJECT_RULES.md @@ -72,12 +72,12 @@ All Flowker code **MUST** use `lib-commons/v4` as the foundation library. This i ```go import ( - libCommons "github.com/LerianStudio/lib-commons/v5/commons" - libZap "github.com/LerianStudio/lib-commons/v5/commons/zap" // Logger initialization (bootstrap only) - libLog "github.com/LerianStudio/lib-commons/v5/commons/log" // Logger interface (services, routes, handlers) - libOtel "github.com/LerianStudio/lib-commons/v5/commons/opentelemetry" - libHTTP "github.com/LerianStudio/lib-commons/v5/commons/net/http" - libMongo "github.com/LerianStudio/lib-commons/v5/commons/mongo" + libCommons "github.com/LerianStudio/lib-commons/v4/commons" + libZap "github.com/LerianStudio/lib-commons/v4/commons/zap" // Logger initialization (bootstrap only) + libLog "github.com/LerianStudio/lib-commons/v4/commons/log" // Logger interface (services, routes, handlers) + libOtel "github.com/LerianStudio/lib-commons/v4/commons/opentelemetry" + libHTTP "github.com/LerianStudio/lib-commons/v4/commons/net/http" + libMongo "github.com/LerianStudio/lib-commons/v4/commons/mongo" ) ``` @@ -396,8 +396,8 @@ All services **MUST** integrate OpenTelemetry using lib-commons. ```go import ( - libCommons "github.com/LerianStudio/lib-commons/v5/commons" - libOtel "github.com/LerianStudio/lib-commons/v5/commons/opentelemetry" + libCommons "github.com/LerianStudio/lib-commons/v4/commons" + libOtel "github.com/LerianStudio/lib-commons/v4/commons/opentelemetry" "go.opentelemetry.io/otel/trace" // OK: Only for trace.Span type ) @@ -516,7 +516,7 @@ libHTTP.WithError(c, err) ```go import ( - libHTTP "github.com/LerianStudio/lib-commons/v5/commons/net/http" + libHTTP "github.com/LerianStudio/lib-commons/v4/commons/net/http" "github.com/gofiber/fiber/v2" ) @@ -1312,8 +1312,8 @@ All logging **MUST** use `logger.Log(ctx, level, message, fields...)` with typed ```go import ( - libCommons "github.com/LerianStudio/lib-commons/v5/commons" - libLog "github.com/LerianStudio/lib-commons/v5/commons/log" + libCommons "github.com/LerianStudio/lib-commons/v4/commons" + libLog "github.com/LerianStudio/lib-commons/v4/commons/log" ) logger, _, _, _ := libCommons.NewTrackingFromContext(ctx) diff --git a/docs/readyz-guide.md b/docs/readyz-guide.md new file mode 100644 index 0000000..5b1e10a --- /dev/null +++ b/docs/readyz-guide.md @@ -0,0 +1,266 @@ +# Flowker Readyz Activation Guide + +## Overview + +Flowker implements the canonical `/readyz` readiness probe that checks **MongoDB** and **PostgreSQL** (audit database) connectivity. The endpoint returns `200 OK` when all dependencies are reachable and `503 Service Unavailable` when any dependency is down. This enables Kubernetes to make accurate routing decisions and prevents traffic from reaching pods with unreachable databases. + +**Scope fence:** `/readyz` is an infrastructure probe. It does NOT include synthetic business-logic checks, certificate validity validation, or performance SLIs. + +--- + +## Endpoints + +| Endpoint | Purpose | K8s Probe | Auth Required | +|----------|---------|-----------|---------------| +| `/readyz` | Readiness probe — checks all dependencies | `readinessProbe` | No | +| `/health` | Liveness probe — gated by startup self-probe | `livenessProbe` | No | +| `/metrics` | Prometheus scrape — includes readyz metrics | N/A | No | + +Both `/readyz` and `/health` are mounted **before** authentication middleware, allowing Kubernetes probes to function without credentials. + +--- + +## Environment Variables + +| Variable | Description | Default | Required | +|----------|-------------|---------|----------| +| `DEPLOYMENT_MODE` | `saas` / `byoc` / `local` — SaaS enforces TLS on all DBs | `local` | No | +| `VERSION` | Service version (injected via ldflags or env) | `dev` | No | +| `MONGO_URI` | MongoDB connection string | `mongodb://localhost:27017/flowker` | No | +| `AUDIT_DB_HOST` | PostgreSQL audit database host | `localhost` | No | +| `AUDIT_DB_PORT` | PostgreSQL audit database port | `5432` | No | +| `AUDIT_DB_USER` | PostgreSQL audit database user | `flowker_audit` | No | +| `AUDIT_DB_PASSWORD` | PostgreSQL audit database password | `flowker_audit` | No | +| `AUDIT_DB_NAME` | PostgreSQL audit database name | `flowker_audit` | No | +| `AUDIT_DB_SSL_MODE` | PostgreSQL SSL mode (`disable`, `require`, etc.) | `disable` | No | + +--- + +## Kubernetes Probe Configuration + +Copy-paste into your Deployment spec: + +```yaml +readinessProbe: + httpGet: + path: /readyz + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 2 + +livenessProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + failureThreshold: 3 +``` + +**Note:** Drain grace period is 12 seconds (default), calibrated for `periodSeconds=5 * failureThreshold=2 + buffer`. + +--- + +## Response Contract + +### `/readyz` Response (200 OK) + +```json +{ + "status": "healthy", + "checks": { + "mongodb": { + "status": "up", + "latency_ms": 3, + "tls": true + }, + "postgresql": { + "status": "up", + "latency_ms": 2, + "tls": false + } + }, + "version": "1.0.0", + "deployment_mode": "local" +} +``` + +### `/readyz` Response (503 Service Unavailable) + +```json +{ + "status": "unhealthy", + "checks": { + "mongodb": { + "status": "down", + "error": "context deadline exceeded" + }, + "postgresql": { + "status": "up", + "latency_ms": 2, + "tls": false + } + }, + "version": "1.0.0", + "deployment_mode": "local" +} +``` + +--- + +## Status Vocabulary + +| Status | Meaning | Aggregation Impact | +|--------|---------|-------------------| +| `up` | Dependency reachable, check passed | Healthy | +| `down` | Dependency unreachable or check failed | **Unhealthy → 503** | +| `degraded` | Circuit breaker half-open OR partial failure | **Unhealthy → 503** | +| `skipped` | Optional dependency explicitly disabled | Healthy (ignored) | +| `n/a` | Not applicable in current mode | Healthy (ignored) | + +**Aggregation rule:** Top-level `status` is `"healthy"` **if and only if** every check is `up`, `skipped`, or `n/a`. ANY `down` or `degraded` → HTTP 503. + +--- + +## Metrics Reference + +All metrics are emitted on every `/readyz` request and at startup self-probe. + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `readyz_check_duration_ms` | Histogram | `dep`, `status` | Duration of dependency checks in milliseconds | +| `readyz_check_status` | Counter | `dep`, `status` | Count of check outcomes per dependency | +| `selfprobe_result` | Gauge | `dep` | Last self-probe result (1=up, 0=down) | + +**Histogram buckets (ms):** `[1, 5, 10, 25, 50, 100, 250, 500, 1000, 2000, 5000]` + +### Example Prometheus Queries + +```promql +# Dependency check latency P99 (aggregated across instances) +histogram_quantile(0.99, sum by (le) (rate(readyz_check_duration_ms_bucket[5m]))) + +# Dependency check latency P99 by dependency +histogram_quantile(0.99, sum by (dep, le) (rate(readyz_check_duration_ms_bucket[5m]))) + +# Failure rate by dependency +rate(readyz_check_status{status="down"}[5m]) + +# Current self-probe state per dependency +selfprobe_result{dep="mongodb"} +selfprobe_result{dep="postgresql"} + +# Detect any failing dependency (min=0 means at least one dep is down) +min(selfprobe_result) +``` + +--- + +## Operational Runbook + +### `/readyz` returning 503 + +1. Check which dependency is `down` or `degraded` in the response body +2. Inspect the `error` field for the failing dependency +3. Verify network connectivity to the database +4. Check database credentials and connection strings +5. Review database logs for connection limits or authentication failures + +### `/health` returning 503 + +1. Self-probe failed at startup — pod is alive but not ready +2. Check startup logs for `startup_self_probe_failed` or `self_probe_check` entries +3. Kubernetes will restart the pod automatically (livenessProbe failure) +4. Investigate why dependencies were unreachable at boot time + +### Service refusing to start in SaaS mode + +1. Error message: `DEPLOYMENT_MODE=saas: TLS required for but not configured` +2. `ValidateSaaSTLS` detected a non-TLS connection string +3. Update the connection string to use TLS: + - MongoDB: Use `mongodb+srv://` or add `tls=true` parameter + - PostgreSQL: Change `sslmode` from `disable` to `require` or higher +4. Restart the service + +### In-flight requests killed during deploy + +1. Drain grace period too short +2. Increase grace period past `periodSeconds * failureThreshold` (currently 12s) +3. Verify `/readyz` returns 503 immediately after SIGTERM +4. Check `GracefulShutdownHandler` logs for proper drain sequence + +--- + +## Scope Fence (What's NOT in /readyz) + +| Excluded | Reason | Where it belongs | +|----------|--------|------------------| +| Synthetic business-logic probes | `/readyz` is infrastructure, not application health | Separate `/biz-check` endpoint if needed | +| Certificate validity/expiry | `/readyz` reports TLS posture, not cert health | External cert-monitoring tool | +| Performance SLIs (p99 latency) | `/readyz` is binary healthy/unhealthy | Telemetry dashboards | +| Per-tenant business rules | Global `/readyz` is tenant-agnostic | Future `/readyz/tenant/:id` endpoint | + +--- + +## Common Errors + +| Symptom | Cause | Fix | +|---------|-------|-----| +| 401 on `/readyz` | Endpoint mounted behind auth middleware | Verify `/readyz` is mounted BEFORE auth in `routes.go` | +| Metrics not in `/metrics` | Metrics not registered or registry not exposed | Check `prometheus.MustRegister` calls in `readyz/metrics.go` | +| `selfprobe_result{dep="X"}` stays 0 | `RunSelfProbe` not invoked or dep X unreachable at boot | Check startup logs for `self_probe_check` entries; query `selfprobe_result{dep="mongodb"}` or `selfprobe_result{dep="postgresql"}` individually | +| `/readyz` returns stale results | Cache layer in front (FORBIDDEN) | Remove any caching middleware on `/readyz` | +| Service starts with non-TLS in SaaS mode | `ValidateSaaSTLS` not called or bypassed | Verify `ValidateSaaSTLS(cfg)` is called in bootstrap before connections | + +--- + +## TLS Detection Logic + +TLS detection uses `url.Parse()` (not substring matching) for accuracy: + +| Dependency | TLS Detection Method | +|------------|---------------------| +| MongoDB | Scheme `mongodb+srv://` OR query param `tls=true` OR `ssl=true` | +| PostgreSQL | Query param `sslmode` ∈ {`require`, `verify-ca`, `verify-full`} | + +**Note:** PostgreSQL `sslmode=allow` and `sslmode=prefer` are NOT considered TLS-enabled because they can fall back to cleartext connections. + +Example connection strings: + +```bash +# MongoDB with TLS +MONGO_URI="mongodb+srv://user:pass@cluster.mongodb.net/flowker" +MONGO_URI="mongodb://user:pass@host:27017/flowker?tls=true" + +# PostgreSQL with TLS +AUDIT_DB_SSL_MODE="require" +AUDIT_DB_SSL_MODE="verify-full" +``` + +--- + +## Graceful Shutdown Sequence + +1. **SIGTERM received** → `drainingState.Store(true)` +2. **`/readyz` returns 503** immediately (even if deps are healthy) +3. **K8s stops routing** new traffic to the pod +4. **Grace period (12s)** allows in-flight requests to complete +5. **`server.Shutdown()`** called +6. **Dependencies closed** (MongoDB, PostgreSQL connections) +7. **Process exits** + +--- + +## Files Reference + +| File | Purpose | +|------|---------| +| `internal/adapters/http/in/readyz/handler.go` | `/readyz` handler, response types, status vocabulary, and `IsDrainingFunc` variable | +| `internal/adapters/http/in/readyz/checker.go` | MongoDB and PostgreSQL health checker implementations | +| `internal/adapters/http/in/readyz/metrics.go` | Prometheus metrics registration | +| `internal/adapters/http/in/health/handler.go` | `/health` handler (gated by `SelfProbeOKFunc`) | +| `internal/bootstrap/selfprobe.go` | Startup self-probe implementation | +| `internal/bootstrap/shutdown.go` | Graceful shutdown handler with draining state | +| `internal/bootstrap/tls_detection.go` | TLS detection via url.Parse | +| `internal/bootstrap/tls_enforcement.go` | `ValidateSaaSTLS()` for SaaS mode | diff --git a/go.mod b/go.mod index e83d09d..5c82dc5 100644 --- a/go.mod +++ b/go.mod @@ -119,16 +119,16 @@ require ( go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/contrib v1.40.0 // indirect go.opentelemetry.io/contrib/bridges/otelzap v0.17.0 // indirect - go.opentelemetry.io/otel v1.43.0 // indirect + go.opentelemetry.io/otel v1.43.0 go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.18.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 // indirect go.opentelemetry.io/otel/log v0.18.0 // indirect - go.opentelemetry.io/otel/metric v1.43.0 // indirect + go.opentelemetry.io/otel/metric v1.43.0 go.opentelemetry.io/otel/sdk v1.43.0 // indirect go.opentelemetry.io/otel/sdk/log v0.18.0 // indirect - go.opentelemetry.io/otel/sdk/metric v1.43.0 // indirect + go.opentelemetry.io/otel/sdk/metric v1.43.0 go.opentelemetry.io/proto/otlp v1.10.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.1 // indirect diff --git a/internal/adapters/http/in/health/handler.go b/internal/adapters/http/in/health/handler.go index 14dc623..217d2c1 100644 --- a/internal/adapters/http/in/health/handler.go +++ b/internal/adapters/http/in/health/handler.go @@ -12,6 +12,11 @@ import ( "github.com/gofiber/fiber/v2" ) +// SelfProbeOKFunc is a function that returns whether the startup self-probe passed. +// This is set by the bootstrap package to avoid import cycles. +// Default: always returns true (for backward compatibility). +var SelfProbeOKFunc = func() bool { return true } + // Version is set during build via ldflags var Version = "dev" @@ -54,70 +59,29 @@ func NewHealthHandler(dbChecker DatabaseChecker) (*HealthHandler, error) { }, nil } -// Liveness handles liveness probe (GET /health/live) -// Returns 200 OK if the application process is running -// Used by Kubernetes to restart unhealthy pods -// @Summary Kubernetes liveness probe -// @Description Returns 200 OK if the application process is running -// @Tags Health -// @Produce json -// @Success 200 {object} HealthResponse -// @Router /health/live [get] -func (h *HealthHandler) Liveness(c *fiber.Ctx) error { - response := HealthResponse{ - Status: "alive", - Timestamp: time.Now(), - } - - return libHTTP.Respond(c, fiber.StatusOK, response) -} - -// Readiness handles readiness probe (GET /health/ready) -// Returns 200 OK if the application is ready to serve traffic -// Returns 503 Service Unavailable if dependencies are not ready -// Used by load balancers to route traffic -// @Summary Kubernetes readiness probe -// @Description Returns 200 OK if the application is ready to serve traffic -// @Tags Health -// @Produce json -// @Success 200 {object} HealthResponse -// @Failure 503 {object} HealthResponse -// @Router /health/ready [get] -func (h *HealthHandler) Readiness(c *fiber.Ctx) error { - checks := make(map[string]CheckResult) - - // Check database connectivity - dbStatus := h.checkDatabase() - checks["database"] = dbStatus - - // Determine overall status - overallStatus := "ready" - statusCode := fiber.StatusOK - - if dbStatus.Status != "healthy" { - overallStatus = "not_ready" - statusCode = fiber.StatusServiceUnavailable - } - - response := HealthResponse{ - Status: overallStatus, - Timestamp: time.Now(), - Checks: checks, - } - - return libHTTP.Respond(c, statusCode, response) -} - // Health handles combined health check (GET /health) -// Returns detailed health information including uptime and version -// @Summary Combined health check -// @Description Returns detailed health information including uptime and version -// @Tags Health -// @Produce json -// @Success 200 {object} HealthResponse -// @Failure 503 {object} HealthResponse -// @Router /health [get] +// Returns detailed health information including uptime and version. +// GATE: Returns 503 immediately if startup self-probe did not pass. +// NOTE: Infrastructure routes (/health, /readyz) are excluded from OpenAPI spec. func (h *HealthHandler) Health(c *fiber.Ctx) error { + // Gate on self-probe result: if startup probe failed, return 503 immediately. + // This ensures the service doesn't accept traffic until all dependencies + // were verified healthy at startup. + if !SelfProbeOKFunc() { + return libHTTP.Respond(c, fiber.StatusServiceUnavailable, HealthResponse{ + Status: "unhealthy", + Version: Version, + Uptime: time.Since(h.startTime).String(), + Timestamp: time.Now(), + Checks: map[string]CheckResult{ + "self_probe": { + Status: "failed", + Message: "startup self-probe did not pass", + }, + }, + }) + } + checks := make(map[string]CheckResult) // Check database connectivity diff --git a/internal/adapters/http/in/health/handler_test.go b/internal/adapters/http/in/health/handler_test.go index 213ae16..457e4af 100644 --- a/internal/adapters/http/in/health/handler_test.go +++ b/internal/adapters/http/in/health/handler_test.go @@ -7,7 +7,6 @@ package health_test import ( - "errors" "io" "net/http/httptest" "testing" @@ -38,130 +37,9 @@ func TestNewHealthHandler_WithNilDbChecker(t *testing.T) { assert.NotNil(t, handler, "HealthHandler should not be nil even with nil dbChecker") } -func TestHealthHandler_Liveness(t *testing.T) { - ctrl := gomock.NewController(t) - mock := NewMockDatabaseChecker(ctrl) - - handler, err := health.NewHealthHandler(mock) - require.NoError(t, err) - - app := fiber.New() - app.Get("/health/live", handler.Liveness) - - req := httptest.NewRequest("GET", "/health/live", nil) - resp, err := app.Test(req, -1) - require.NoError(t, err) - - assert.Equal(t, fiber.StatusOK, resp.StatusCode, "Liveness should return 200 OK") - - body, err := io.ReadAll(resp.Body) - require.NoError(t, err) - assert.Contains(t, string(body), "status", "Response should contain status field") - assert.Contains(t, string(body), "alive", "Response should contain alive status") -} - -func TestHealthHandler_Liveness_WithNilDbChecker(t *testing.T) { - handler, err := health.NewHealthHandler(nil) - require.NoError(t, err) - - app := fiber.New() - app.Get("/health/live", handler.Liveness) - - req := httptest.NewRequest("GET", "/health/live", nil) - resp, err := app.Test(req, -1) - require.NoError(t, err) - - // Liveness should always return 200 regardless of database state - assert.Equal(t, fiber.StatusOK, resp.StatusCode, "Liveness should return 200 OK even with nil dbChecker") -} - -func TestHealthHandler_Readiness_DatabaseConnected(t *testing.T) { - ctrl := gomock.NewController(t) - mock := NewMockDatabaseChecker(ctrl) - - mock.EXPECT().IsConnected().Return(true) - mock.EXPECT().Ping(gomock.Any()).Return(nil) - - handler, err := health.NewHealthHandler(mock) - require.NoError(t, err) - - app := fiber.New() - app.Get("/health/ready", handler.Readiness) - - req := httptest.NewRequest("GET", "/health/ready", nil) - resp, err := app.Test(req, -1) - require.NoError(t, err) - - assert.Equal(t, fiber.StatusOK, resp.StatusCode, "Readiness should return 200 when database connected") - - body, err := io.ReadAll(resp.Body) - require.NoError(t, err) - assert.Contains(t, string(body), "status", "Response should contain status field") - assert.Contains(t, string(body), "ready", "Response should contain ready status") -} - -func TestHealthHandler_Readiness_DatabaseNotConnected(t *testing.T) { - ctrl := gomock.NewController(t) - mock := NewMockDatabaseChecker(ctrl) - - mock.EXPECT().IsConnected().Return(false) - - handler, err := health.NewHealthHandler(mock) - require.NoError(t, err) - - app := fiber.New() - app.Get("/health/ready", handler.Readiness) - - req := httptest.NewRequest("GET", "/health/ready", nil) - resp, err := app.Test(req, -1) - require.NoError(t, err) - - assert.Equal(t, fiber.StatusServiceUnavailable, resp.StatusCode, "Readiness should return 503 when database disconnected") - - body, err := io.ReadAll(resp.Body) - require.NoError(t, err) - assert.Contains(t, string(body), "status", "Response should contain status field") - assert.Contains(t, string(body), "not_ready", "Response should contain not_ready status") -} - -func TestHealthHandler_Readiness_DatabasePingFails(t *testing.T) { - ctrl := gomock.NewController(t) - mock := NewMockDatabaseChecker(ctrl) - - mock.EXPECT().IsConnected().Return(true) - mock.EXPECT().Ping(gomock.Any()).Return(errors.New("connection refused")) - - handler, err := health.NewHealthHandler(mock) - require.NoError(t, err) - - app := fiber.New() - app.Get("/health/ready", handler.Readiness) - - req := httptest.NewRequest("GET", "/health/ready", nil) - resp, err := app.Test(req, -1) - require.NoError(t, err) - - assert.Equal(t, fiber.StatusServiceUnavailable, resp.StatusCode, "Readiness should return 503 when database ping fails") - - body, err := io.ReadAll(resp.Body) - require.NoError(t, err) - assert.Contains(t, string(body), "not_ready", "Response should contain not_ready status") - assert.Contains(t, string(body), "connection refused", "Response should contain error message") -} - -func TestHealthHandler_Readiness_WithNilDbChecker(t *testing.T) { - handler, err := health.NewHealthHandler(nil) - require.NoError(t, err) - - app := fiber.New() - app.Get("/health/ready", handler.Readiness) - - req := httptest.NewRequest("GET", "/health/ready", nil) - resp, err := app.Test(req, -1) - require.NoError(t, err) - - assert.Equal(t, fiber.StatusServiceUnavailable, resp.StatusCode, "Readiness should return 503 when dbChecker is nil") -} +// NOTE: The old Liveness() and Readiness() methods were removed as part of the +// /readyz implementation. The canonical readiness probe is now /readyz (see readyz package). +// The /health endpoint is the liveness probe, gated by startup self-probe. func TestHealthHandler_Health_DatabaseConnected(t *testing.T) { ctrl := gomock.NewController(t) diff --git a/internal/adapters/http/in/health/selfprobe_integration_test.go b/internal/adapters/http/in/health/selfprobe_integration_test.go new file mode 100644 index 0000000..0ed6595 --- /dev/null +++ b/internal/adapters/http/in/health/selfprobe_integration_test.go @@ -0,0 +1,119 @@ +// Copyright (c) 2026 Lerian Studio. All rights reserved. +// Use of this source code is governed by the Elastic License 2.0 +// that can be found in the LICENSE file. + +//go:build integration + +package health_test + +import ( + "io" + "net/http/httptest" + "testing" + + "github.com/LerianStudio/flowker/internal/adapters/http/in/health" + "github.com/gofiber/fiber/v2" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" +) + +// TestHealthHandler_Health_SelfProbeGating_WhenFalse tests that /health returns 503 +// when selfProbeOK is false (startup self-probe did not pass). +func TestHealthHandler_Health_SelfProbeGating_WhenFalse(t *testing.T) { + // Arrange: Set self-probe function to return false + originalFunc := health.SelfProbeOKFunc + health.SelfProbeOKFunc = func() bool { return false } + defer func() { health.SelfProbeOKFunc = originalFunc }() + + ctrl := gomock.NewController(t) + mock := NewMockDatabaseChecker(ctrl) + // Note: Database check methods should NOT be called when self-probe is false + // The handler should short-circuit before checking dependencies + + handler, err := health.NewHealthHandler(mock) + require.NoError(t, err) + + app := fiber.New() + app.Get("/health", handler.Health) + + // Act + req := httptest.NewRequest("GET", "/health", nil) + resp, err := app.Test(req, -1) + require.NoError(t, err) + defer resp.Body.Close() + + // Assert: Should return 503 because self-probe failed + assert.Equal(t, fiber.StatusServiceUnavailable, resp.StatusCode, + "Health should return 503 when selfProbeOK is false") + + body, err := io.ReadAll(resp.Body) + require.NoError(t, err) + assert.Contains(t, string(body), "unhealthy", "Response should contain unhealthy status") + assert.Contains(t, string(body), "self_probe", "Response should indicate self_probe check failed") +} + +// TestHealthHandler_Health_SelfProbeGating_WhenTrue tests that /health proceeds with +// normal dependency checks when selfProbeOK is true. +func TestHealthHandler_Health_SelfProbeGating_WhenTrue(t *testing.T) { + // Arrange: Set self-probe function to return true + originalFunc := health.SelfProbeOKFunc + health.SelfProbeOKFunc = func() bool { return true } + defer func() { health.SelfProbeOKFunc = originalFunc }() + + ctrl := gomock.NewController(t) + mock := NewMockDatabaseChecker(ctrl) + // When self-probe passed, database check should be called + mock.EXPECT().IsConnected().Return(true) + mock.EXPECT().Ping(gomock.Any()).Return(nil) + + handler, err := health.NewHealthHandler(mock) + require.NoError(t, err) + + app := fiber.New() + app.Get("/health", handler.Health) + + // Act + req := httptest.NewRequest("GET", "/health", nil) + resp, err := app.Test(req, -1) + require.NoError(t, err) + defer resp.Body.Close() + + // Assert: Should return 200 because both self-probe and deps are healthy + assert.Equal(t, fiber.StatusOK, resp.StatusCode, + "Health should return 200 when selfProbeOK is true and deps are up") + + body, err := io.ReadAll(resp.Body) + require.NoError(t, err) + assert.Contains(t, string(body), "healthy", "Response should contain healthy status") +} + +// TestHealthHandler_Health_SelfProbeGating_WhenTrueAndDepsDown tests that /health returns 503 +// when self-probe passed but dependencies are down. +func TestHealthHandler_Health_SelfProbeGating_WhenTrueAndDepsDown(t *testing.T) { + // Arrange: Set self-probe function to return true but deps are down + originalFunc := health.SelfProbeOKFunc + health.SelfProbeOKFunc = func() bool { return true } + defer func() { health.SelfProbeOKFunc = originalFunc }() + + ctrl := gomock.NewController(t) + mock := NewMockDatabaseChecker(ctrl) + // Database is not connected + mock.EXPECT().IsConnected().Return(false) + + handler, err := health.NewHealthHandler(mock) + require.NoError(t, err) + + app := fiber.New() + app.Get("/health", handler.Health) + + // Act + req := httptest.NewRequest("GET", "/health", nil) + resp, err := app.Test(req, -1) + require.NoError(t, err) + defer resp.Body.Close() + + // Assert: Should return 503 because deps are down + assert.Equal(t, fiber.StatusServiceUnavailable, resp.StatusCode, + "Health should return 503 when deps are down even if selfProbeOK is true") +} diff --git a/internal/adapters/http/in/readyz/checker.go b/internal/adapters/http/in/readyz/checker.go new file mode 100644 index 0000000..53358b1 --- /dev/null +++ b/internal/adapters/http/in/readyz/checker.go @@ -0,0 +1,163 @@ +// Copyright (c) 2026 Lerian Studio. All rights reserved. +// Use of this source code is governed by the Elastic License 2.0 +// that can be found in the LICENSE file. + +package readyz + +import ( + "context" + "net/url" +) + +// MongoDBPinger defines the interface for MongoDB health checks. +// This abstraction allows the readyz package to check MongoDB health +// without importing the bootstrap package directly. +type MongoDBPinger interface { + IsConnected() bool + Ping(ctx context.Context) error +} + +// MongoDBConfig provides MongoDB configuration for TLS detection. +type MongoDBConfig interface { + GetURI() string + GetTLSCACert() string +} + +// MongoDBChecker adapts a MongoDB connection to the HealthChecker interface. +type MongoDBChecker struct { + pinger MongoDBPinger + tlsConfig MongoDBConfig +} + +// NewMongoDBChecker creates a new MongoDBChecker. +func NewMongoDBChecker(pinger MongoDBPinger, tlsConfig MongoDBConfig) *MongoDBChecker { + return &MongoDBChecker{ + pinger: pinger, + tlsConfig: tlsConfig, + } +} + +// Name returns the dependency name. +func (c *MongoDBChecker) Name() string { + return "mongodb" +} + +// Ping checks if MongoDB is reachable. +func (c *MongoDBChecker) Ping(ctx context.Context) error { + if c.pinger == nil { + return &SkippedError{Reason: "mongodb pinger not configured"} + } + + if !c.pinger.IsConnected() { + return errNotConnected + } + + return c.pinger.Ping(ctx) +} + +// IsTLSEnabled returns whether TLS is configured for MongoDB. +// Detection uses URL parsing (not substring matching per anti-pattern #4). +func (c *MongoDBChecker) IsTLSEnabled() bool { + if c.tlsConfig == nil { + return false + } + + // Check if CA cert is configured (explicit TLS) + if c.tlsConfig.GetTLSCACert() != "" { + return true + } + + // Parse URI and check for TLS query parameter + // Anti-pattern #4: MUST NOT use strings.Contains(uri, "tls=true") + uri := c.tlsConfig.GetURI() + if uri == "" { + return false + } + + parsed, err := url.Parse(uri) + if err != nil { + return false + } + + // Check scheme (mongodb+srv always uses TLS) + if parsed.Scheme == "mongodb+srv" { + return true + } + + // Check tls or ssl query parameter (ssl is legacy, equivalent to tls) + query := parsed.Query() + + return query.Get("tls") == "true" || query.Get("ssl") == "true" +} + +// PostgreSQLPinger defines the interface for PostgreSQL health checks. +type PostgreSQLPinger interface { + IsConnected() bool + Ping(ctx context.Context) error +} + +// PostgreSQLConfig provides PostgreSQL configuration for TLS detection. +type PostgreSQLConfig interface { + GetSSLMode() string +} + +// PostgreSQLChecker adapts a PostgreSQL connection to the HealthChecker interface. +type PostgreSQLChecker struct { + pinger PostgreSQLPinger + tlsConfig PostgreSQLConfig +} + +// NewPostgreSQLChecker creates a new PostgreSQLChecker. +func NewPostgreSQLChecker(pinger PostgreSQLPinger, tlsConfig PostgreSQLConfig) *PostgreSQLChecker { + return &PostgreSQLChecker{ + pinger: pinger, + tlsConfig: tlsConfig, + } +} + +// Name returns the dependency name. +func (c *PostgreSQLChecker) Name() string { + return "postgresql" +} + +// Ping checks if PostgreSQL is reachable. +func (c *PostgreSQLChecker) Ping(ctx context.Context) error { + if c.pinger == nil { + return &SkippedError{Reason: "postgresql pinger not configured"} + } + + if !c.pinger.IsConnected() { + return errNotConnected + } + + return c.pinger.Ping(ctx) +} + +// IsTLSEnabled returns whether TLS is configured for PostgreSQL. +func (c *PostgreSQLChecker) IsTLSEnabled() bool { + if c.tlsConfig == nil { + return false + } + + sslMode := c.tlsConfig.GetSSLMode() + + // sslmode values that enable TLS: require, verify-ca, verify-full + // sslmode values that don't require TLS: disable, allow, prefer + switch sslMode { + case "require", "verify-ca", "verify-full": + return true + default: + return false + } +} + +// errNotConnected is returned when the database is not connected. +var errNotConnected = &connectionError{message: "not connected"} + +type connectionError struct { + message string +} + +func (e *connectionError) Error() string { + return e.message +} diff --git a/internal/adapters/http/in/readyz/checker_test.go b/internal/adapters/http/in/readyz/checker_test.go new file mode 100644 index 0000000..d2834b8 --- /dev/null +++ b/internal/adapters/http/in/readyz/checker_test.go @@ -0,0 +1,197 @@ +// Copyright (c) 2026 Lerian Studio. All rights reserved. +// Use of this source code is governed by the Elastic License 2.0 +// that can be found in the LICENSE file. + +//go:build unit + +package readyz_test + +import ( + "context" + "errors" + "testing" + + "github.com/LerianStudio/flowker/internal/adapters/http/in/readyz" + "github.com/stretchr/testify/assert" +) + +// --- MongoDB Checker Tests --- + +type mockMongoPinger struct { + connected bool + pingErr error +} + +func (m *mockMongoPinger) IsConnected() bool { return m.connected } +func (m *mockMongoPinger) Ping(ctx context.Context) error { return m.pingErr } + +type mockMongoConfig struct { + uri string + tlsCACert string +} + +func (m *mockMongoConfig) GetURI() string { return m.uri } +func (m *mockMongoConfig) GetTLSCACert() string { return m.tlsCACert } + +func TestMongoDBChecker_Name(t *testing.T) { + checker := readyz.NewMongoDBChecker(nil, nil) + assert.Equal(t, "mongodb", checker.Name()) +} + +func TestMongoDBChecker_Ping_Success(t *testing.T) { + pinger := &mockMongoPinger{connected: true, pingErr: nil} + checker := readyz.NewMongoDBChecker(pinger, nil) + + err := checker.Ping(context.Background()) + assert.NoError(t, err) +} + +func TestMongoDBChecker_Ping_NotConnected(t *testing.T) { + pinger := &mockMongoPinger{connected: false} + checker := readyz.NewMongoDBChecker(pinger, nil) + + err := checker.Ping(context.Background()) + assert.Error(t, err) + assert.Contains(t, err.Error(), "not connected") +} + +func TestMongoDBChecker_Ping_PingFails(t *testing.T) { + pinger := &mockMongoPinger{connected: true, pingErr: errors.New("connection refused")} + checker := readyz.NewMongoDBChecker(pinger, nil) + + err := checker.Ping(context.Background()) + assert.Error(t, err) + assert.Contains(t, err.Error(), "connection refused") +} + +func TestMongoDBChecker_Ping_NilPinger(t *testing.T) { + checker := readyz.NewMongoDBChecker(nil, nil) + + err := checker.Ping(context.Background()) + assert.Error(t, err) + + var skippedErr *readyz.SkippedError + assert.True(t, errors.As(err, &skippedErr)) +} + +func TestMongoDBChecker_IsTLSEnabled_WithCACert(t *testing.T) { + config := &mockMongoConfig{tlsCACert: "base64-encoded-cert"} + checker := readyz.NewMongoDBChecker(nil, config) + + assert.True(t, checker.IsTLSEnabled()) +} + +func TestMongoDBChecker_IsTLSEnabled_WithMongoDBSRV(t *testing.T) { + config := &mockMongoConfig{uri: "mongodb+srv://user:pass@cluster.example.com/db"} + checker := readyz.NewMongoDBChecker(nil, config) + + assert.True(t, checker.IsTLSEnabled()) +} + +func TestMongoDBChecker_IsTLSEnabled_WithTLSQueryParam(t *testing.T) { + // Anti-pattern #4: Must use url.Parse, not strings.Contains + config := &mockMongoConfig{uri: "mongodb://host:27017/db?tls=true"} + checker := readyz.NewMongoDBChecker(nil, config) + + assert.True(t, checker.IsTLSEnabled()) +} + +func TestMongoDBChecker_IsTLSEnabled_NoTLS(t *testing.T) { + config := &mockMongoConfig{uri: "mongodb://localhost:27017/db"} + checker := readyz.NewMongoDBChecker(nil, config) + + assert.False(t, checker.IsTLSEnabled()) +} + +func TestMongoDBChecker_IsTLSEnabled_NilConfig(t *testing.T) { + checker := readyz.NewMongoDBChecker(nil, nil) + + assert.False(t, checker.IsTLSEnabled()) +} + +// --- PostgreSQL Checker Tests --- + +type mockPostgresPinger struct { + connected bool + pingErr error +} + +func (m *mockPostgresPinger) IsConnected() bool { return m.connected } +func (m *mockPostgresPinger) Ping(ctx context.Context) error { return m.pingErr } + +type mockPostgresConfig struct { + sslMode string +} + +func (m *mockPostgresConfig) GetSSLMode() string { return m.sslMode } + +func TestPostgreSQLChecker_Name(t *testing.T) { + checker := readyz.NewPostgreSQLChecker(nil, nil) + assert.Equal(t, "postgresql", checker.Name()) +} + +func TestPostgreSQLChecker_Ping_Success(t *testing.T) { + pinger := &mockPostgresPinger{connected: true, pingErr: nil} + checker := readyz.NewPostgreSQLChecker(pinger, nil) + + err := checker.Ping(context.Background()) + assert.NoError(t, err) +} + +func TestPostgreSQLChecker_Ping_NotConnected(t *testing.T) { + pinger := &mockPostgresPinger{connected: false} + checker := readyz.NewPostgreSQLChecker(pinger, nil) + + err := checker.Ping(context.Background()) + assert.Error(t, err) + assert.Contains(t, err.Error(), "not connected") +} + +func TestPostgreSQLChecker_Ping_PingFails(t *testing.T) { + pinger := &mockPostgresPinger{connected: true, pingErr: errors.New("connection refused")} + checker := readyz.NewPostgreSQLChecker(pinger, nil) + + err := checker.Ping(context.Background()) + assert.Error(t, err) + assert.Contains(t, err.Error(), "connection refused") +} + +func TestPostgreSQLChecker_Ping_NilPinger(t *testing.T) { + checker := readyz.NewPostgreSQLChecker(nil, nil) + + err := checker.Ping(context.Background()) + assert.Error(t, err) + + var skippedErr *readyz.SkippedError + assert.True(t, errors.As(err, &skippedErr)) +} + +func TestPostgreSQLChecker_IsTLSEnabled_SSLModes(t *testing.T) { + testCases := []struct { + sslMode string + expected bool + }{ + {"require", true}, + {"verify-ca", true}, + {"verify-full", true}, + {"disable", false}, + {"allow", false}, + {"prefer", false}, + {"", false}, + } + + for _, tc := range testCases { + t.Run(tc.sslMode, func(t *testing.T) { + config := &mockPostgresConfig{sslMode: tc.sslMode} + checker := readyz.NewPostgreSQLChecker(nil, config) + + assert.Equal(t, tc.expected, checker.IsTLSEnabled()) + }) + } +} + +func TestPostgreSQLChecker_IsTLSEnabled_NilConfig(t *testing.T) { + checker := readyz.NewPostgreSQLChecker(nil, nil) + + assert.False(t, checker.IsTLSEnabled()) +} diff --git a/internal/adapters/http/in/readyz/drain_test.go b/internal/adapters/http/in/readyz/drain_test.go new file mode 100644 index 0000000..f544f03 --- /dev/null +++ b/internal/adapters/http/in/readyz/drain_test.go @@ -0,0 +1,191 @@ +// Copyright (c) 2026 Lerian Studio. All rights reserved. +// Use of this source code is governed by the Elastic License 2.0 +// that can be found in the LICENSE file. + +//go:build unit + +package readyz_test + +import ( + "encoding/json" + "io" + "net/http/httptest" + "testing" + + "github.com/LerianStudio/flowker/internal/adapters/http/in/readyz" + "github.com/gofiber/fiber/v2" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestReadyzHandler_DrainState_WhenDraining tests that /readyz returns 503 +// immediately when draining state is true (graceful shutdown in progress). +func TestReadyzHandler_DrainState_WhenDraining(t *testing.T) { + // Arrange: Set draining function to return true + originalFunc := readyz.IsDrainingFunc + readyz.IsDrainingFunc = func() bool { return true } + defer func() { readyz.IsDrainingFunc = originalFunc }() + + // Even with healthy dependencies, should return 503 when draining + mongoChecker := &mockHealthChecker{ + name: "mongodb", + pingErr: nil, + tlsEnabled: true, + } + + handler := readyz.NewHandler( + readyz.WithChecker(mongoChecker), + readyz.WithVersion("1.2.3"), + readyz.WithDeploymentMode("local"), + ) + + app := fiber.New() + app.Get("/readyz", handler.Readyz) + + // Act + req := httptest.NewRequest("GET", "/readyz", nil) + resp, err := app.Test(req, -1) + require.NoError(t, err) + defer resp.Body.Close() + + // Assert: Status code should be 503 due to draining state + assert.Equal(t, fiber.StatusServiceUnavailable, resp.StatusCode, + "Should return 503 when draining state is true") + + body, err := io.ReadAll(resp.Body) + require.NoError(t, err) + + var response readyz.Response + err = json.Unmarshal(body, &response) + require.NoError(t, err) + + // Status should be "unhealthy" when draining + assert.Equal(t, "unhealthy", response.Status, + "Top-level status should be 'unhealthy' when draining") +} + +// TestReadyzHandler_DrainState_WhenNotDraining tests that /readyz proceeds with +// normal dependency checks when draining state is false. +func TestReadyzHandler_DrainState_WhenNotDraining(t *testing.T) { + // Arrange: Set draining function to return false + originalFunc := readyz.IsDrainingFunc + readyz.IsDrainingFunc = func() bool { return false } + defer func() { readyz.IsDrainingFunc = originalFunc }() + + mongoChecker := &mockHealthChecker{ + name: "mongodb", + pingErr: nil, + tlsEnabled: true, + } + + handler := readyz.NewHandler( + readyz.WithChecker(mongoChecker), + readyz.WithVersion("1.2.3"), + readyz.WithDeploymentMode("local"), + ) + + app := fiber.New() + app.Get("/readyz", handler.Readyz) + + // Act + req := httptest.NewRequest("GET", "/readyz", nil) + resp, err := app.Test(req, -1) + require.NoError(t, err) + defer resp.Body.Close() + + // Assert: Should return 200 when not draining and deps are up + assert.Equal(t, fiber.StatusOK, resp.StatusCode, + "Should return 200 when draining state is false and deps are up") + + body, err := io.ReadAll(resp.Body) + require.NoError(t, err) + + var response readyz.Response + err = json.Unmarshal(body, &response) + require.NoError(t, err) + + assert.Equal(t, "healthy", response.Status, + "Top-level status should be 'healthy' when not draining and deps are up") +} + +// TestReadyzHandler_DrainState_ShortCircuits tests that drain check happens +// BEFORE dependency checks (no deps should be pinged when draining). +func TestReadyzHandler_DrainState_ShortCircuits(t *testing.T) { + // Arrange: Set draining function to return true + originalFunc := readyz.IsDrainingFunc + readyz.IsDrainingFunc = func() bool { return true } + defer func() { readyz.IsDrainingFunc = originalFunc }() + + // Use a checker that would fail - but it should never be called + checker := &mockHealthChecker{ + name: "should-not-be-checked", + pingErr: nil, // Would return healthy if called + tlsEnabled: false, + } + + handler := readyz.NewHandler( + readyz.WithChecker(checker), + readyz.WithVersion("1.2.3"), + readyz.WithDeploymentMode("local"), + ) + + app := fiber.New() + app.Get("/readyz", handler.Readyz) + + // Act + req := httptest.NewRequest("GET", "/readyz", nil) + resp, err := app.Test(req, -1) + require.NoError(t, err) + defer resp.Body.Close() + + // Assert: Should return 503 immediately + assert.Equal(t, fiber.StatusServiceUnavailable, resp.StatusCode, + "Should return 503 immediately when draining") + + body, err := io.ReadAll(resp.Body) + require.NoError(t, err) + + var response readyz.Response + err = json.Unmarshal(body, &response) + require.NoError(t, err) + + // Checks map should be empty because drain check short-circuits + assert.Empty(t, response.Checks, + "Checks map should be empty when drain short-circuits") +} + +// TestReadyzHandler_DrainState_PreservesVersionAndDeploymentMode tests that +// even when draining, version and deployment_mode are included in response. +func TestReadyzHandler_DrainState_PreservesVersionAndDeploymentMode(t *testing.T) { + // Arrange: Set draining function to return true + originalFunc := readyz.IsDrainingFunc + readyz.IsDrainingFunc = func() bool { return true } + defer func() { readyz.IsDrainingFunc = originalFunc }() + + handler := readyz.NewHandler( + readyz.WithVersion("2.0.0-drain-test"), + readyz.WithDeploymentMode("saas"), + ) + + app := fiber.New() + app.Get("/readyz", handler.Readyz) + + // Act + req := httptest.NewRequest("GET", "/readyz", nil) + resp, err := app.Test(req, -1) + require.NoError(t, err) + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + require.NoError(t, err) + + var response readyz.Response + err = json.Unmarshal(body, &response) + require.NoError(t, err) + + // Assert: Version and deployment mode should still be present + assert.Equal(t, "2.0.0-drain-test", response.Version, + "Version should be preserved when draining") + assert.Equal(t, "saas", response.DeploymentMode, + "DeploymentMode should be preserved when draining") +} diff --git a/internal/adapters/http/in/readyz/handler.go b/internal/adapters/http/in/readyz/handler.go new file mode 100644 index 0000000..d63b95b --- /dev/null +++ b/internal/adapters/http/in/readyz/handler.go @@ -0,0 +1,367 @@ +// Copyright (c) 2026 Lerian Studio. All rights reserved. +// Use of this source code is governed by the Elastic License 2.0 +// that can be found in the LICENSE file. + +// Package readyz implements the canonical /readyz endpoint following Ring Standards. +// The response contract is non-negotiable and matches the Lerian /readyz specification exactly. +package readyz + +import ( + "context" + "errors" + "os" + "sync" + "time" + + libHTTP "github.com/LerianStudio/lib-commons/v5/commons/net/http" + "github.com/gofiber/fiber/v2" +) + +// Version is set during build via ldflags. Used in /readyz response. +var Version = "dev" + +// IsDrainingFunc is a function that returns whether the service is draining. +// This is set by the bootstrap package to avoid import cycles. +// Default: always returns false (not draining). +var IsDrainingFunc = func() bool { return false } + +// HealthChecker defines the interface for dependency health checks. +// Each dependency (MongoDB, PostgreSQL, Redis, etc.) must implement this interface. +type HealthChecker interface { + // Name returns the dependency name for the checks map key (e.g., "mongodb", "postgresql") + Name() string + + // Ping checks if the dependency is reachable. + // Returns nil for "up", error for "down". + // Special error types: + // - *SkippedError: returns "skipped" status (requires Reason) + // - *NotApplicableError: returns "n/a" status (requires Reason) + // - *DegradedError: returns "degraded" status (circuit breaker half-open) + Ping(ctx context.Context) error + + // IsTLSEnabled returns whether TLS is configured for this dependency. + // This reflects configured posture, not runtime certificate validity. + IsTLSEnabled() bool +} + +// SkippedError indicates the dependency is intentionally skipped (disabled via config). +type SkippedError struct { + Reason string +} + +func (e *SkippedError) Error() string { return "skipped: " + e.Reason } + +// NotApplicableError indicates the dependency is not applicable in current mode. +type NotApplicableError struct { + Reason string +} + +func (e *NotApplicableError) Error() string { return "n/a: " + e.Reason } + +// DegradedError indicates the dependency is in degraded state (e.g., circuit breaker half-open). +type DegradedError struct { + BreakerState string +} + +func (e *DegradedError) Error() string { return "degraded: " + e.BreakerState } + +// Response is the canonical /readyz response contract. +// This shape is NON-NEGOTIABLE per Ring Standards. +type Response struct { + Status string `json:"status"` // "healthy" or "unhealthy" + Checks map[string]DependencyCheck `json:"checks"` + Version string `json:"version"` + DeploymentMode string `json:"deployment_mode"` +} + +// DependencyCheck represents a single dependency's health status. +type DependencyCheck struct { + Status string `json:"status"` // up/down/degraded/skipped/n/a (closed set) + LatencyMs int64 `json:"latency_ms,omitempty"` // when status is up or degraded + TLS bool `json:"tls,omitempty"` // when dependency supports TLS + Error string `json:"error,omitempty"` // when status is down or degraded + Reason string `json:"reason,omitempty"` // when status is skipped or n/a + BreakerState string `json:"breaker_state,omitempty"` // when dep is breaker-wrapped +} + +// Handler handles the /readyz endpoint. +type Handler struct { + checkers []HealthChecker + version string + deploymentMode string + checkTimeout time.Duration + + // Note: Drain state is now checked via IsDrainingFunc() in the Readyz handler. + // When draining, /readyz immediately returns 503 to signal K8s to stop routing traffic. +} + +// Option configures the Handler. +type Option func(*Handler) + +// WithChecker adds a health checker to the handler. +func WithChecker(checker HealthChecker) Option { + return func(h *Handler) { + h.checkers = append(h.checkers, checker) + } +} + +// WithVersion sets the version to include in the response. +func WithVersion(version string) Option { + return func(h *Handler) { + h.version = version + } +} + +// WithDeploymentMode sets the deployment mode to include in the response. +func WithDeploymentMode(mode string) Option { + return func(h *Handler) { + h.deploymentMode = mode + } +} + +// WithCheckTimeout sets the per-dependency check timeout. +func WithCheckTimeout(timeout time.Duration) Option { + return func(h *Handler) { + h.checkTimeout = timeout + } +} + +// NewHandler creates a new Handler with the given options. +func NewHandler(opts ...Option) *Handler { + h := &Handler{ + checkers: make([]HealthChecker, 0), + version: getVersionFromEnv(), + deploymentMode: getDeploymentModeFromEnv(), + checkTimeout: 2 * time.Second, // Default per Ring Standards + } + + for _, opt := range opts { + opt(h) + } + + return h +} + +// Readyz handles GET /readyz requests. +// Returns 200 with status "healthy" iff every check is up/skipped/n/a. +// Returns 503 with status "unhealthy" if ANY check is down or degraded. +// GATE: Returns 503 immediately if service is draining (graceful shutdown). +// +// @Summary Kubernetes readiness probe +// @Description Returns dependency health status following canonical contract +// @Tags Health +// @Accept json +// @Produce json +// @Success 200 {object} Response "Service is healthy - all dependencies up/skipped/n-a" +// @Failure 503 {object} Response "Service is unhealthy - one or more dependencies down/degraded" +// @Router /readyz [get] +func (h *Handler) Readyz(c *fiber.Ctx) error { + // Check draining state first - short-circuit to 503 during graceful shutdown. + // This signals K8s to stop routing new traffic to this pod. + if IsDrainingFunc() { + return libHTTP.Respond(c, fiber.StatusServiceUnavailable, Response{ + Status: "unhealthy", + Checks: map[string]DependencyCheck{}, + Version: h.version, + DeploymentMode: h.deploymentMode, + }) + } + + checks := h.runChecks(c.Context()) + + // Aggregation rule: 503 iff any check is "down" or "degraded" + status := "healthy" + statusCode := fiber.StatusOK + + for _, check := range checks { + if check.Status == "down" || check.Status == "degraded" { + status = "unhealthy" + statusCode = fiber.StatusServiceUnavailable + + break + } + } + + response := Response{ + Status: status, + Checks: checks, + Version: h.version, + DeploymentMode: h.deploymentMode, + } + + return libHTTP.Respond(c, statusCode, response) +} + +// runChecks executes all health checks in parallel with per-dep timeout. +func (h *Handler) runChecks(ctx context.Context) map[string]DependencyCheck { + checks := make(map[string]DependencyCheck) + + if len(h.checkers) == 0 { + return checks + } + + var ( + mu sync.Mutex + wg sync.WaitGroup + ) + + for _, checker := range h.checkers { + wg.Add(1) + + go func(chk HealthChecker) { + defer wg.Done() + + check := h.runSingleCheck(ctx, chk) + + mu.Lock() + checks[chk.Name()] = check + mu.Unlock() + }(checker) + } + + wg.Wait() + + return checks +} + +// runSingleCheck executes a single health check with timeout. +// Emits readyz_check_duration_ms and readyz_check_status metrics for every check. +func (h *Handler) runSingleCheck(ctx context.Context, checker HealthChecker) DependencyCheck { + checkCtx, cancel := context.WithTimeout(ctx, h.checkTimeout) + defer cancel() + + start := time.Now() + err := checker.Ping(checkCtx) + duration := time.Since(start) + latency := duration.Milliseconds() + + check := DependencyCheck{ + TLS: checker.IsTLSEnabled(), + } + + switch { + case err == nil: + // Status: up + check.Status = "up" + check.LatencyMs = latency + + case errors.Is(err, context.DeadlineExceeded): + // Timeout = down + check.Status = "down" + check.Error = "timeout exceeded" + + default: + // Check for special error types + var ( + skippedErr *SkippedError + naErr *NotApplicableError + degradedErr *DegradedError + ) + + switch { + case errors.As(err, &skippedErr): + check.Status = "skipped" + check.Reason = skippedErr.Reason + // No latency for skipped + + case errors.As(err, &naErr): + check.Status = "n/a" + check.Reason = naErr.Reason + // No latency for n/a + + case errors.As(err, °radedErr): + check.Status = "degraded" + check.BreakerState = degradedErr.BreakerState + check.LatencyMs = latency + + default: + // Generic error = down + check.Status = "down" + check.Error = err.Error() + } + } + + // Emit readyz metrics for every check (per metrics contract) + depName := checker.Name() + EmitCheckDuration(ctx, depName, check.Status, duration) + EmitCheckStatus(ctx, depName, check.Status) + + return check +} + +// SelfProbeResult represents the result of a dependency's self-probe. +type SelfProbeResult struct { + Name string // Dependency name (e.g., "mongodb", "postgresql") + Status string // up, down, skipped + Err error // Non-nil when status is "down" +} + +// RunSelfProbe executes health checks for all dependencies and emits selfprobe_result metrics. +// This is called once at startup after telemetry is initialized. +// Unlike Readyz, this only emits the gauge metric for monitoring startup health state. +// Returns the results for each dependency so the caller can determine overall health. +func (h *Handler) RunSelfProbe(ctx context.Context) []SelfProbeResult { + results := make([]SelfProbeResult, 0, len(h.checkers)) + + for _, checker := range h.checkers { + checkCtx, cancel := context.WithTimeout(ctx, h.checkTimeout) + err := checker.Ping(checkCtx) + + cancel() + + result := SelfProbeResult{ + Name: checker.Name(), + } + + if err == nil { + result.Status = "up" + } else { + // Check for special error types (same handling as runCheck) + var ( + skippedErr *SkippedError + naErr *NotApplicableError + degradedErr *DegradedError + ) + + switch { + case errors.As(err, &skippedErr): + result.Status = "skipped" + case errors.As(err, &naErr): + result.Status = "n/a" + case errors.As(err, °radedErr): + result.Status = "degraded" + result.Err = err + default: + result.Status = "down" + result.Err = err + } + } + + results = append(results, result) + + // Emit selfprobe_result metric: 1.0 for healthy (up or skipped), 0.0 for unhealthy + // "skipped" is healthy because it means the dependency was intentionally disabled + isHealthy := result.Status == "up" || result.Status == "skipped" + EmitSelfProbeResult(ctx, checker.Name(), isHealthy) + } + + return results +} + +// getVersionFromEnv returns version from env or default. +func getVersionFromEnv() string { + if v := os.Getenv("VERSION"); v != "" { + return v + } + + return Version +} + +// getDeploymentModeFromEnv returns deployment mode from env or default. +func getDeploymentModeFromEnv() string { + if m := os.Getenv("DEPLOYMENT_MODE"); m != "" { + return m + } + + return "local" +} diff --git a/internal/adapters/http/in/readyz/handler_test.go b/internal/adapters/http/in/readyz/handler_test.go new file mode 100644 index 0000000..420ae03 --- /dev/null +++ b/internal/adapters/http/in/readyz/handler_test.go @@ -0,0 +1,478 @@ +// Copyright (c) 2026 Lerian Studio. All rights reserved. +// Use of this source code is governed by the Elastic License 2.0 +// that can be found in the LICENSE file. + +//go:build unit + +package readyz_test + +import ( + "context" + "encoding/json" + "errors" + "io" + "net/http/httptest" + "testing" + + "github.com/LerianStudio/flowker/internal/adapters/http/in/readyz" + "github.com/gofiber/fiber/v2" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// mockHealthChecker implements readyz.HealthChecker for testing +type mockHealthChecker struct { + name string + pingErr error + tlsEnabled bool +} + +func (m *mockHealthChecker) Name() string { + return m.name +} + +func (m *mockHealthChecker) Ping(ctx context.Context) error { + return m.pingErr +} + +func (m *mockHealthChecker) IsTLSEnabled() bool { + return m.tlsEnabled +} + +func TestReadyzHandler_AllDepsUp_Returns200WithCorrectShape(t *testing.T) { + // Arrange: All dependencies are healthy + mongoChecker := &mockHealthChecker{ + name: "mongodb", + pingErr: nil, + tlsEnabled: true, + } + postgresChecker := &mockHealthChecker{ + name: "postgresql", + pingErr: nil, + tlsEnabled: false, + } + + handler := readyz.NewHandler( + readyz.WithChecker(mongoChecker), + readyz.WithChecker(postgresChecker), + readyz.WithVersion("1.2.3"), + readyz.WithDeploymentMode("local"), + ) + + app := fiber.New() + app.Get("/readyz", handler.Readyz) + + // Act + req := httptest.NewRequest("GET", "/readyz", nil) + resp, err := app.Test(req, -1) + require.NoError(t, err) + defer resp.Body.Close() + + // Assert: Status code is 200 + assert.Equal(t, fiber.StatusOK, resp.StatusCode, "Should return 200 when all deps are up") + + // Assert: Response shape matches canonical contract + body, err := io.ReadAll(resp.Body) + require.NoError(t, err) + + var response readyz.Response + err = json.Unmarshal(body, &response) + require.NoError(t, err, "Response should be valid JSON") + + // Verify canonical contract fields + assert.Equal(t, "healthy", response.Status, "Top-level status should be 'healthy'") + assert.Equal(t, "1.2.3", response.Version, "Version should be set") + assert.Equal(t, "local", response.DeploymentMode, "Deployment mode should be set") + assert.NotNil(t, response.Checks, "Checks map should not be nil") + + // Verify MongoDB check + mongoCheck, ok := response.Checks["mongodb"] + require.True(t, ok, "Should have mongodb check") + assert.Equal(t, "up", mongoCheck.Status, "MongoDB status should be 'up'") + assert.True(t, mongoCheck.TLS, "MongoDB TLS should be true") + assert.GreaterOrEqual(t, mongoCheck.LatencyMs, int64(0), "LatencyMs should be set when up") + + // Verify PostgreSQL check + pgCheck, ok := response.Checks["postgresql"] + require.True(t, ok, "Should have postgresql check") + assert.Equal(t, "up", pgCheck.Status, "PostgreSQL status should be 'up'") + assert.False(t, pgCheck.TLS, "PostgreSQL TLS should be false") +} + +func TestReadyzHandler_OneDepDown_Returns503(t *testing.T) { + // Arrange: MongoDB is up, PostgreSQL is down + mongoChecker := &mockHealthChecker{ + name: "mongodb", + pingErr: nil, + tlsEnabled: true, + } + postgresChecker := &mockHealthChecker{ + name: "postgresql", + pingErr: errors.New("connection refused"), + tlsEnabled: false, + } + + handler := readyz.NewHandler( + readyz.WithChecker(mongoChecker), + readyz.WithChecker(postgresChecker), + readyz.WithVersion("1.2.3"), + readyz.WithDeploymentMode("local"), + ) + + app := fiber.New() + app.Get("/readyz", handler.Readyz) + + // Act + req := httptest.NewRequest("GET", "/readyz", nil) + resp, err := app.Test(req, -1) + require.NoError(t, err) + defer resp.Body.Close() + + // Assert: Status code is 503 + assert.Equal(t, fiber.StatusServiceUnavailable, resp.StatusCode, "Should return 503 when any dep is down") + + body, err := io.ReadAll(resp.Body) + require.NoError(t, err) + + var response readyz.Response + err = json.Unmarshal(body, &response) + require.NoError(t, err) + + // Top-level status should be unhealthy + assert.Equal(t, "unhealthy", response.Status, "Top-level status should be 'unhealthy'") + + // MongoDB should be up + mongoCheck, ok := response.Checks["mongodb"] + require.True(t, ok, "Should have mongodb check") + assert.Equal(t, "up", mongoCheck.Status) + + // PostgreSQL should be down with error + pgCheck, ok := response.Checks["postgresql"] + require.True(t, ok, "Should have postgresql check") + assert.Equal(t, "down", pgCheck.Status, "PostgreSQL status should be 'down'") + assert.Contains(t, pgCheck.Error, "connection refused", "Should include error message") +} + +func TestReadyzHandler_ResponseIncludesVersionAndDeploymentMode(t *testing.T) { + // Arrange + handler := readyz.NewHandler( + readyz.WithVersion("2.0.0-beta"), + readyz.WithDeploymentMode("saas"), + ) + + app := fiber.New() + app.Get("/readyz", handler.Readyz) + + // Act + req := httptest.NewRequest("GET", "/readyz", nil) + resp, err := app.Test(req, -1) + require.NoError(t, err) + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + require.NoError(t, err) + + var response readyz.Response + err = json.Unmarshal(body, &response) + require.NoError(t, err) + + // Assert + assert.Equal(t, "2.0.0-beta", response.Version, "Version should match") + assert.Equal(t, "saas", response.DeploymentMode, "Deployment mode should match") +} + +func TestReadyzHandler_NoAuthRequired_NeverReturnsUnauthorized(t *testing.T) { + // This test verifies that /readyz is a public endpoint (mounted BEFORE auth middleware) + // by checking that it never returns 401/403, regardless of authentication state. + testCases := []struct { + name string + authHeader string + depUp bool + }{ + {"no auth header, deps up", "", true}, + {"no auth header, deps down", "", false}, + {"invalid auth header, deps up", "Bearer invalid-token", true}, + {"invalid auth header, deps down", "Bearer invalid-token", false}, + {"malformed auth header, deps up", "malformed", true}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + checker := &mockHealthChecker{ + name: "mongodb", + } + if !tc.depUp { + checker.pingErr = errors.New("down") + } + + handler := readyz.NewHandler( + readyz.WithChecker(checker), + ) + + app := fiber.New() + app.Get("/readyz", handler.Readyz) + + req := httptest.NewRequest("GET", "/readyz", nil) + if tc.authHeader != "" { + req.Header.Set("Authorization", tc.authHeader) + } + + resp, err := app.Test(req, -1) + require.NoError(t, err) + defer resp.Body.Close() + + // Should NEVER return 401 or 403 + assert.NotEqual(t, fiber.StatusUnauthorized, resp.StatusCode, "Should never return 401") + assert.NotEqual(t, fiber.StatusForbidden, resp.StatusCode, "Should never return 403") + + // Should only return 200 or 503 + assert.True(t, + resp.StatusCode == fiber.StatusOK || resp.StatusCode == fiber.StatusServiceUnavailable, + "Should return 200 or 503, got %d", resp.StatusCode) + }) + } +} + +func TestReadyzHandler_ReportsCorrectTLSPosture(t *testing.T) { + // Arrange: Mix of TLS enabled/disabled dependencies + tlsChecker := &mockHealthChecker{ + name: "secure-db", + pingErr: nil, + tlsEnabled: true, + } + noTLSChecker := &mockHealthChecker{ + name: "insecure-db", + pingErr: nil, + tlsEnabled: false, + } + + handler := readyz.NewHandler( + readyz.WithChecker(tlsChecker), + readyz.WithChecker(noTLSChecker), + ) + + app := fiber.New() + app.Get("/readyz", handler.Readyz) + + // Act + req := httptest.NewRequest("GET", "/readyz", nil) + resp, err := app.Test(req, -1) + require.NoError(t, err) + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + require.NoError(t, err) + + var response readyz.Response + err = json.Unmarshal(body, &response) + require.NoError(t, err) + + // Assert TLS posture is correctly reported + secureCheck := response.Checks["secure-db"] + assert.True(t, secureCheck.TLS, "Secure DB should report TLS=true") + + insecureCheck := response.Checks["insecure-db"] + assert.False(t, insecureCheck.TLS, "Insecure DB should report TLS=false") +} + +func TestReadyzHandler_StatusVocabulary(t *testing.T) { + // Test the five valid status values: up, down, degraded, skipped, n/a + testCases := []struct { + name string + checkStatus string + expectedStatus string + expectedHTTP int + }{ + {"up status", "up", "healthy", fiber.StatusOK}, + {"skipped status", "skipped", "healthy", fiber.StatusOK}, + {"n/a status", "n/a", "healthy", fiber.StatusOK}, + {"down status", "down", "unhealthy", fiber.StatusServiceUnavailable}, + {"degraded status", "degraded", "unhealthy", fiber.StatusServiceUnavailable}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + var checker readyz.HealthChecker + + switch tc.checkStatus { + case "up": + checker = &mockHealthChecker{name: "test-dep", pingErr: nil} + case "down": + checker = &mockHealthChecker{name: "test-dep", pingErr: errors.New("connection failed")} + case "skipped": + checker = &mockSkippedChecker{name: "test-dep", reason: "FEATURE_DISABLED=true"} + case "n/a": + checker = &mockNAChecker{name: "test-dep", reason: "multi-tenant: see /readyz/tenant/:id"} + case "degraded": + checker = &mockDegradedChecker{name: "test-dep", breakerState: "half-open"} + } + + handler := readyz.NewHandler(readyz.WithChecker(checker)) + + app := fiber.New() + app.Get("/readyz", handler.Readyz) + + req := httptest.NewRequest("GET", "/readyz", nil) + resp, err := app.Test(req, -1) + require.NoError(t, err) + defer resp.Body.Close() + + assert.Equal(t, tc.expectedHTTP, resp.StatusCode) + + body, _ := io.ReadAll(resp.Body) + var response readyz.Response + _ = json.Unmarshal(body, &response) + + assert.Equal(t, tc.expectedStatus, response.Status) + }) + } +} + +func TestReadyzHandler_SkippedRequiresReason(t *testing.T) { + checker := &mockSkippedChecker{ + name: "optional-cache", + reason: "REDIS_ENABLED=false", + } + + handler := readyz.NewHandler(readyz.WithChecker(checker)) + + app := fiber.New() + app.Get("/readyz", handler.Readyz) + + req := httptest.NewRequest("GET", "/readyz", nil) + resp, err := app.Test(req, -1) + require.NoError(t, err) + defer resp.Body.Close() + + body, _ := io.ReadAll(resp.Body) + var response readyz.Response + _ = json.Unmarshal(body, &response) + + check := response.Checks["optional-cache"] + assert.Equal(t, "skipped", check.Status) + assert.Equal(t, "REDIS_ENABLED=false", check.Reason, "Skipped status MUST include reason") +} + +func TestReadyzHandler_NARequiresReason(t *testing.T) { + checker := &mockNAChecker{ + name: "tenant-db", + reason: "multi-tenant: see /readyz/tenant/:id", + } + + handler := readyz.NewHandler(readyz.WithChecker(checker)) + + app := fiber.New() + app.Get("/readyz", handler.Readyz) + + req := httptest.NewRequest("GET", "/readyz", nil) + resp, err := app.Test(req, -1) + require.NoError(t, err) + defer resp.Body.Close() + + body, _ := io.ReadAll(resp.Body) + var response readyz.Response + _ = json.Unmarshal(body, &response) + + check := response.Checks["tenant-db"] + assert.Equal(t, "n/a", check.Status) + assert.Equal(t, "multi-tenant: see /readyz/tenant/:id", check.Reason, "N/A status MUST include reason") +} + +func TestReadyzHandler_LatencyMsOmittedForSkippedAndNA(t *testing.T) { + skippedChecker := &mockSkippedChecker{name: "skipped-dep", reason: "disabled"} + naChecker := &mockNAChecker{name: "na-dep", reason: "not applicable"} + + handler := readyz.NewHandler( + readyz.WithChecker(skippedChecker), + readyz.WithChecker(naChecker), + ) + + app := fiber.New() + app.Get("/readyz", handler.Readyz) + + req := httptest.NewRequest("GET", "/readyz", nil) + resp, err := app.Test(req, -1) + require.NoError(t, err) + defer resp.Body.Close() + + body, _ := io.ReadAll(resp.Body) + + // Parse as raw JSON to check omitted fields + var raw map[string]interface{} + _ = json.Unmarshal(body, &raw) + + checks := raw["checks"].(map[string]interface{}) + + // Skipped check should NOT have latency_ms + skippedCheck := checks["skipped-dep"].(map[string]interface{}) + _, hasLatency := skippedCheck["latency_ms"] + assert.False(t, hasLatency, "Skipped status should NOT have latency_ms") + + // N/A check should NOT have latency_ms + naCheck := checks["na-dep"].(map[string]interface{}) + _, hasLatency = naCheck["latency_ms"] + assert.False(t, hasLatency, "N/A status should NOT have latency_ms") +} + +func TestReadyzHandler_NoDepsReturnsHealthy(t *testing.T) { + // Edge case: no checkers configured + handler := readyz.NewHandler( + readyz.WithVersion("1.0.0"), + readyz.WithDeploymentMode("local"), + ) + + app := fiber.New() + app.Get("/readyz", handler.Readyz) + + req := httptest.NewRequest("GET", "/readyz", nil) + resp, err := app.Test(req, -1) + require.NoError(t, err) + defer resp.Body.Close() + + assert.Equal(t, fiber.StatusOK, resp.StatusCode, "No deps should return 200") + + body, _ := io.ReadAll(resp.Body) + var response readyz.Response + _ = json.Unmarshal(body, &response) + + assert.Equal(t, "healthy", response.Status) + assert.Empty(t, response.Checks) +} + +// --- Mock implementations for special status types --- + +// mockSkippedChecker implements HealthChecker that always returns "skipped" status +type mockSkippedChecker struct { + name string + reason string +} + +func (m *mockSkippedChecker) Name() string { return m.name } +func (m *mockSkippedChecker) Ping(ctx context.Context) error { + return &readyz.SkippedError{Reason: m.reason} +} +func (m *mockSkippedChecker) IsTLSEnabled() bool { return false } + +// mockNAChecker implements HealthChecker that always returns "n/a" status +type mockNAChecker struct { + name string + reason string +} + +func (m *mockNAChecker) Name() string { return m.name } +func (m *mockNAChecker) Ping(ctx context.Context) error { + return &readyz.NotApplicableError{Reason: m.reason} +} +func (m *mockNAChecker) IsTLSEnabled() bool { return false } + +// mockDegradedChecker implements HealthChecker that returns "degraded" status +type mockDegradedChecker struct { + name string + breakerState string +} + +func (m *mockDegradedChecker) Name() string { return m.name } +func (m *mockDegradedChecker) Ping(ctx context.Context) error { + return &readyz.DegradedError{BreakerState: m.breakerState} +} +func (m *mockDegradedChecker) IsTLSEnabled() bool { return false } diff --git a/internal/adapters/http/in/readyz/metrics.go b/internal/adapters/http/in/readyz/metrics.go new file mode 100644 index 0000000..7e8466b --- /dev/null +++ b/internal/adapters/http/in/readyz/metrics.go @@ -0,0 +1,131 @@ +// Copyright (c) 2026 Lerian Studio. All rights reserved. +// Use of this source code is governed by the Elastic License 2.0 +// that can be found in the LICENSE file. + +package readyz + +import ( + "context" + "sync" + "time" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +const meterName = "github.com/LerianStudio/flowker/readyz" + +var ( + metricsOnce sync.Once + + checkDuration metric.Float64Histogram + checkStatus metric.Int64Counter + selfProbe metric.Float64Gauge +) + +// Histogram bucket boundaries in milliseconds. +// Covers cache-fast (1ms) to timeout-slow (5000ms) per readyz metrics contract. +var histogramBuckets = []float64{1, 5, 10, 25, 50, 100, 250, 500, 1000, 2000, 5000} + +// InitMetrics initializes the readyz metrics. +// This function is idempotent and safe to call multiple times. +// Call once at application startup after telemetry is initialized. +func InitMetrics() error { + var initErr error + + metricsOnce.Do(func() { + meter := otel.GetMeterProvider().Meter(meterName) + + // Histogram for check duration + checkDuration, initErr = meter.Float64Histogram( + "readyz_check_duration_ms", + metric.WithDescription("Duration of /readyz dependency checks in milliseconds"), + metric.WithUnit("ms"), + metric.WithExplicitBucketBoundaries(histogramBuckets...), + ) + if initErr != nil { + return + } + + // Counter for check status + checkStatus, initErr = meter.Int64Counter( + "readyz_check_status", + metric.WithDescription("Count of /readyz check outcomes"), + ) + if initErr != nil { + return + } + + // Gauge for self-probe result + selfProbe, initErr = meter.Float64Gauge( + "selfprobe_result", + metric.WithDescription("Last self-probe result per dependency (1=up, 0=down)"), + ) + }) + + return initErr +} + +// EmitCheckDuration records the duration of a dependency check. +// Labels: dep (dependency name), status (check outcome: up/down/degraded/skipped/n/a). +// Duration is recorded in milliseconds per the readyz metrics contract. +func EmitCheckDuration(ctx context.Context, dep, status string, duration time.Duration) { + if checkDuration == nil { + return // Metrics not initialized - graceful no-op + } + + checkDuration.Record(ctx, float64(duration.Milliseconds()), + metric.WithAttributes( + attribute.String("dep", dep), + attribute.String("status", status), + ), + ) +} + +// EmitCheckStatus increments the counter for a dependency check outcome. +// Labels: dep (dependency name), status (check outcome: up/down/degraded/skipped/n/a). +func EmitCheckStatus(ctx context.Context, dep, status string) { + if checkStatus == nil { + return // Metrics not initialized - graceful no-op + } + + checkStatus.Add(ctx, 1, + metric.WithAttributes( + attribute.String("dep", dep), + attribute.String("status", status), + ), + ) +} + +// EmitSelfProbeResult sets the gauge for a dependency's self-probe result. +// Label: dep (dependency name). +// Values: 1.0 (up), 0.0 (down). +// Called from RunSelfProbe() after startup health validation completes. +func EmitSelfProbeResult(ctx context.Context, dep string, up bool) { + if selfProbe == nil { + return // Metrics not initialized - graceful no-op + } + + v := 0.0 + if up { + v = 1.0 + } + + selfProbe.Record(ctx, v, + metric.WithAttributes( + attribute.String("dep", dep), + ), + ) +} + +// ResetMetricsForTest resets the metrics state for testing purposes. +// This allows tests to re-initialize metrics with a fresh meter provider. +// WARNING: This function is intended for testing only and should not be +// called in production code. +func ResetMetricsForTest() { + metricsOnce = sync.Once{} + checkDuration = nil + checkStatus = nil + selfProbe = nil +} diff --git a/internal/adapters/http/in/readyz/metrics_test.go b/internal/adapters/http/in/readyz/metrics_test.go new file mode 100644 index 0000000..6eea97a --- /dev/null +++ b/internal/adapters/http/in/readyz/metrics_test.go @@ -0,0 +1,399 @@ +// Copyright (c) 2026 Lerian Studio. All rights reserved. +// Use of this source code is governed by the Elastic License 2.0 +// that can be found in the LICENSE file. + +//go:build unit + +package readyz_test + +import ( + "context" + "testing" + "time" + + "github.com/LerianStudio/flowker/internal/adapters/http/in/readyz" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" +) + +// setupTestMeterProvider creates a test meter provider with an in-memory reader. +func setupTestMeterProvider(t *testing.T) (*sdkmetric.MeterProvider, *sdkmetric.ManualReader) { + t.Helper() + reader := sdkmetric.NewManualReader() + provider := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + otel.SetMeterProvider(provider) + + return provider, reader +} + +// collectMetrics reads all metrics from the reader. +func collectMetrics(t *testing.T, reader *sdkmetric.ManualReader) *metricdata.ResourceMetrics { + t.Helper() + var rm metricdata.ResourceMetrics + err := reader.Collect(context.Background(), &rm) + require.NoError(t, err) + + return &rm +} + +// findHistogram searches for a histogram metric by name. +func findHistogram(rm *metricdata.ResourceMetrics, name string) *metricdata.Histogram[float64] { + for _, sm := range rm.ScopeMetrics { + for _, m := range sm.Metrics { + if m.Name == name { + if hist, ok := m.Data.(metricdata.Histogram[float64]); ok { + return &hist + } + } + } + } + + return nil +} + +// findCounter searches for a counter metric by name. +func findCounter(rm *metricdata.ResourceMetrics, name string) *metricdata.Sum[int64] { + for _, sm := range rm.ScopeMetrics { + for _, m := range sm.Metrics { + if m.Name == name { + if sum, ok := m.Data.(metricdata.Sum[int64]); ok { + return &sum + } + } + } + } + + return nil +} + +// findGauge searches for a gauge metric by name. +func findGauge(rm *metricdata.ResourceMetrics, name string) *metricdata.Gauge[float64] { + for _, sm := range rm.ScopeMetrics { + for _, m := range sm.Metrics { + if m.Name == name { + if gauge, ok := m.Data.(metricdata.Gauge[float64]); ok { + return &gauge + } + } + } + } + + return nil +} + +// getAttributeValue extracts an attribute value by key from a data point. +func getAttributeValue(attrs []any, key string) string { + for _, attr := range attrs { + if kv, ok := attr.(interface { + Key() string + Value() interface{ AsString() string } + }); ok { + if kv.Key() == key { + return kv.Value().AsString() + } + } + } + + return "" +} + +func TestInitMetrics_RegistersAllThreeMetrics(t *testing.T) { + // Arrange + provider, reader := setupTestMeterProvider(t) + defer func() { + _ = provider.Shutdown(context.Background()) + }() + + // Reset metrics for clean test + readyz.ResetMetricsForTest() + + // Act + err := readyz.InitMetrics() + require.NoError(t, err, "InitMetrics should not return an error") + + // Emit one of each metric type to verify they're registered + ctx := context.Background() + readyz.EmitCheckDuration(ctx, "test-dep", "up", 50*time.Millisecond) + readyz.EmitCheckStatus(ctx, "test-dep", "up") + readyz.EmitSelfProbeResult(ctx, "test-dep", true) + + // Collect metrics + rm := collectMetrics(t, reader) + + // Assert: All three metrics should be present + hist := findHistogram(rm, "readyz_check_duration_ms") + assert.NotNil(t, hist, "readyz_check_duration_ms histogram should be registered") + + counter := findCounter(rm, "readyz_check_status") + assert.NotNil(t, counter, "readyz_check_status counter should be registered") + + gauge := findGauge(rm, "selfprobe_result") + assert.NotNil(t, gauge, "selfprobe_result gauge should be registered") +} + +func TestEmitCheckDuration_RecordsHistogramWithCorrectLabels(t *testing.T) { + // Arrange + provider, reader := setupTestMeterProvider(t) + defer func() { + _ = provider.Shutdown(context.Background()) + }() + + readyz.ResetMetricsForTest() + err := readyz.InitMetrics() + require.NoError(t, err) + + // Act + ctx := context.Background() + readyz.EmitCheckDuration(ctx, "mongodb", "up", 42*time.Millisecond) + readyz.EmitCheckDuration(ctx, "postgresql", "down", 2500*time.Millisecond) + + // Collect metrics + rm := collectMetrics(t, reader) + hist := findHistogram(rm, "readyz_check_duration_ms") + + // Assert + require.NotNil(t, hist, "Histogram should exist") + require.Len(t, hist.DataPoints, 2, "Should have two data points (mongodb and postgresql)") + + // Verify data points exist for each dependency + var mongoFound, pgFound bool + for _, dp := range hist.DataPoints { + dep, _ := dp.Attributes.Value("dep") + status, _ := dp.Attributes.Value("status") + + if dep.AsString() == "mongodb" && status.AsString() == "up" { + mongoFound = true + assert.Equal(t, uint64(1), dp.Count, "MongoDB should have 1 observation") + } + + if dep.AsString() == "postgresql" && status.AsString() == "down" { + pgFound = true + assert.Equal(t, uint64(1), dp.Count, "PostgreSQL should have 1 observation") + } + } + + assert.True(t, mongoFound, "MongoDB data point should exist with dep=mongodb, status=up") + assert.True(t, pgFound, "PostgreSQL data point should exist with dep=postgresql, status=down") +} + +func TestEmitCheckStatus_IncrementsCounterWithCorrectLabels(t *testing.T) { + // Arrange + provider, reader := setupTestMeterProvider(t) + defer func() { + _ = provider.Shutdown(context.Background()) + }() + + readyz.ResetMetricsForTest() + err := readyz.InitMetrics() + require.NoError(t, err) + + // Act + ctx := context.Background() + readyz.EmitCheckStatus(ctx, "mongodb", "up") + readyz.EmitCheckStatus(ctx, "mongodb", "up") + readyz.EmitCheckStatus(ctx, "postgresql", "down") + + // Collect metrics + rm := collectMetrics(t, reader) + counter := findCounter(rm, "readyz_check_status") + + // Assert + require.NotNil(t, counter, "Counter should exist") + require.Len(t, counter.DataPoints, 2, "Should have two data points (mongodb:up, postgresql:down)") + + // Verify counts + var mongoUpCount, pgDownCount int64 + for _, dp := range counter.DataPoints { + dep, _ := dp.Attributes.Value("dep") + status, _ := dp.Attributes.Value("status") + + if dep.AsString() == "mongodb" && status.AsString() == "up" { + mongoUpCount = dp.Value + } + + if dep.AsString() == "postgresql" && status.AsString() == "down" { + pgDownCount = dp.Value + } + } + + assert.Equal(t, int64(2), mongoUpCount, "MongoDB up counter should be 2") + assert.Equal(t, int64(1), pgDownCount, "PostgreSQL down counter should be 1") +} + +func TestEmitSelfProbeResult_SetsGaugeToOne_WhenUp(t *testing.T) { + // Arrange + provider, reader := setupTestMeterProvider(t) + defer func() { + _ = provider.Shutdown(context.Background()) + }() + + readyz.ResetMetricsForTest() + err := readyz.InitMetrics() + require.NoError(t, err) + + // Act + ctx := context.Background() + readyz.EmitSelfProbeResult(ctx, "mongodb", true) + + // Collect metrics + rm := collectMetrics(t, reader) + gauge := findGauge(rm, "selfprobe_result") + + // Assert + require.NotNil(t, gauge, "Gauge should exist") + require.Len(t, gauge.DataPoints, 1, "Should have one data point") + + dp := gauge.DataPoints[0] + dep, _ := dp.Attributes.Value("dep") + assert.Equal(t, "mongodb", dep.AsString(), "dep attribute should be mongodb") + assert.Equal(t, 1.0, dp.Value, "Gauge value should be 1.0 when up=true") +} + +func TestEmitSelfProbeResult_SetsGaugeToZero_WhenDown(t *testing.T) { + // Arrange + provider, reader := setupTestMeterProvider(t) + defer func() { + _ = provider.Shutdown(context.Background()) + }() + + readyz.ResetMetricsForTest() + err := readyz.InitMetrics() + require.NoError(t, err) + + // Act + ctx := context.Background() + readyz.EmitSelfProbeResult(ctx, "postgresql", false) + + // Collect metrics + rm := collectMetrics(t, reader) + gauge := findGauge(rm, "selfprobe_result") + + // Assert + require.NotNil(t, gauge, "Gauge should exist") + require.Len(t, gauge.DataPoints, 1, "Should have one data point") + + dp := gauge.DataPoints[0] + dep, _ := dp.Attributes.Value("dep") + assert.Equal(t, "postgresql", dep.AsString(), "dep attribute should be postgresql") + assert.Equal(t, 0.0, dp.Value, "Gauge value should be 0.0 when up=false") +} + +func TestEmitFunctions_DoNotPanic_WhenMetricsNotInitialized(t *testing.T) { + // Arrange: Reset metrics so they are nil (not initialized) + readyz.ResetMetricsForTest() + + // Act & Assert: Functions should not panic when metrics are nil + ctx := context.Background() + + assert.NotPanics(t, func() { + readyz.EmitCheckDuration(ctx, "test", "up", time.Millisecond) + }, "EmitCheckDuration should not panic when metrics not initialized") + + assert.NotPanics(t, func() { + readyz.EmitCheckStatus(ctx, "test", "up") + }, "EmitCheckStatus should not panic when metrics not initialized") + + assert.NotPanics(t, func() { + readyz.EmitSelfProbeResult(ctx, "test", true) + }, "EmitSelfProbeResult should not panic when metrics not initialized") +} + +func TestInitMetrics_IsIdempotent(t *testing.T) { + // Arrange + provider, _ := setupTestMeterProvider(t) + defer func() { + _ = provider.Shutdown(context.Background()) + }() + + readyz.ResetMetricsForTest() + + // Act: Call InitMetrics multiple times + err1 := readyz.InitMetrics() + require.NoError(t, err1) + + err2 := readyz.InitMetrics() + require.NoError(t, err2) + + err3 := readyz.InitMetrics() + require.NoError(t, err3) + + // Assert: No errors and no panics (sync.Once ensures single initialization) +} + +func TestEmitCheckDuration_RecordsDurationInMilliseconds(t *testing.T) { + // Arrange + provider, reader := setupTestMeterProvider(t) + defer func() { + _ = provider.Shutdown(context.Background()) + }() + + readyz.ResetMetricsForTest() + err := readyz.InitMetrics() + require.NoError(t, err) + + // Act: Emit duration of 100ms + ctx := context.Background() + readyz.EmitCheckDuration(ctx, "test-dep", "up", 100*time.Millisecond) + + // Collect metrics + rm := collectMetrics(t, reader) + hist := findHistogram(rm, "readyz_check_duration_ms") + + // Assert: Value should be recorded in milliseconds (100), not nanoseconds or seconds + require.NotNil(t, hist) + require.Len(t, hist.DataPoints, 1) + + dp := hist.DataPoints[0] + // Sum should be approximately 100 (milliseconds) + assert.InDelta(t, 100.0, dp.Sum, 0.1, "Duration should be recorded in milliseconds") +} + +func TestHistogramBuckets_CoverExpectedRange(t *testing.T) { + // This test verifies that the histogram bucket boundaries match the spec: + // [1, 5, 10, 25, 50, 100, 250, 500, 1000, 2000, 5000] + + // Arrange + provider, reader := setupTestMeterProvider(t) + defer func() { + _ = provider.Shutdown(context.Background()) + }() + + readyz.ResetMetricsForTest() + err := readyz.InitMetrics() + require.NoError(t, err) + + // Act: Emit values that span the expected bucket range + ctx := context.Background() + testDurations := []time.Duration{ + 500 * time.Microsecond, // < 1ms (below first bucket) + 3 * time.Millisecond, // 1-5ms bucket + 7 * time.Millisecond, // 5-10ms bucket + 15 * time.Millisecond, // 10-25ms bucket + 35 * time.Millisecond, // 25-50ms bucket + 75 * time.Millisecond, // 50-100ms bucket + 150 * time.Millisecond, // 100-250ms bucket + 300 * time.Millisecond, // 250-500ms bucket + 750 * time.Millisecond, // 500-1000ms bucket + 1500 * time.Millisecond, // 1000-2000ms bucket + 3000 * time.Millisecond, // 2000-5000ms bucket + 10000 * time.Millisecond, // > 5000ms (above last bucket) + } + + for i, d := range testDurations { + readyz.EmitCheckDuration(ctx, "test-dep", "up", d) + _ = i // use index to avoid compiler warning + } + + // Collect metrics + rm := collectMetrics(t, reader) + hist := findHistogram(rm, "readyz_check_duration_ms") + + // Assert: Histogram should have recorded all observations + require.NotNil(t, hist) + require.Len(t, hist.DataPoints, 1) + assert.Equal(t, uint64(len(testDurations)), hist.DataPoints[0].Count, + "All %d durations should be recorded", len(testDurations)) +} diff --git a/internal/adapters/http/in/routes.go b/internal/adapters/http/in/routes.go index 664f8d9..117f4a8 100644 --- a/internal/adapters/http/in/routes.go +++ b/internal/adapters/http/in/routes.go @@ -16,6 +16,7 @@ import ( "github.com/LerianStudio/flowker/internal/adapters/http/in/health" "github.com/LerianStudio/flowker/internal/adapters/http/in/middleware" providerconfiguration "github.com/LerianStudio/flowker/internal/adapters/http/in/provider_configuration" + "github.com/LerianStudio/flowker/internal/adapters/http/in/readyz" "github.com/LerianStudio/flowker/internal/adapters/http/in/webhook" "github.com/LerianStudio/flowker/internal/adapters/http/in/workflow" libLog "github.com/LerianStudio/lib-commons/v5/commons/log" @@ -37,11 +38,13 @@ type RouteConfig struct { // NewRoutes creates the Fiber application with all routes configured // dbChecker should implement health.DatabaseChecker interface (e.g., bootstrap.DatabaseManager) +// readyzHandler is the canonical /readyz endpoint handler (must be mounted BEFORE auth middleware) func NewRoutes( lg libLog.Logger, tl *libOtel.Telemetry, swaggerCfg SwaggerConfig, dbChecker health.DatabaseChecker, + readyzHandler *readyz.Handler, routeCfg *RouteConfig, workflowHandler *workflow.Handler, catalogHandler *catalog.Handler, @@ -146,9 +149,13 @@ func NewRoutes( return nil, fmt.Errorf("failed to create health handler: %w", err) } - f.Get("/health", healthHandler.Health) // Combined health (uptime, version, checks) - f.Get("/health/live", healthHandler.Liveness) // Kubernetes liveness probe - f.Get("/health/ready", healthHandler.Readiness) // Kubernetes readiness probe + // /health - Combined health check (uptime, version, checks) + // Gates on selfProbeOK: returns 503 if startup self-probe did not pass. + f.Get("/health", healthHandler.Health) + + // /readyz - Kubernetes readiness probe (canonical contract) + // MUST be registered BEFORE auth middleware per Ring Standards + f.Get("/readyz", readyzHandler.Readyz) // Version f.Get("/version", libHTTP.Version) diff --git a/internal/adapters/http/in/routes_cors.go b/internal/adapters/http/in/routes_cors.go index 1b34884..ca68362 100644 --- a/internal/adapters/http/in/routes_cors.go +++ b/internal/adapters/http/in/routes_cors.go @@ -19,7 +19,7 @@ func getCORSAllowedOrigins(cfg *RouteConfig) string { // skipTelemetryPaths avoids instrumenting noisy endpoints. func skipTelemetryPaths(c *fiber.Ctx) bool { switch c.Path() { - case "/health", "/health/live", "/health/ready": + case "/health", "/readyz": return true default: return false diff --git a/internal/bootstrap/audit_database.go b/internal/bootstrap/audit_database.go index eb9ffe3..29df21b 100644 --- a/internal/bootstrap/audit_database.go +++ b/internal/bootstrap/audit_database.go @@ -8,6 +8,8 @@ import ( "context" "errors" "fmt" + "net/url" + "strings" libPostgres "github.com/LerianStudio/lib-commons/v5/commons/postgres" "github.com/jackc/pgx/v5/pgxpool" @@ -29,11 +31,34 @@ type AuditDBConfig struct { } // DSN returns the PostgreSQL connection string. +// Uses url.URL for proper encoding of special characters in credentials. +// Handles IPv6 literal hosts by bracketing them (e.g., [::1]:5432). func (c *AuditDBConfig) DSN() string { - return fmt.Sprintf( - "postgres://%s:%s@%s:%s/%s?sslmode=%s", - c.User, c.Password, c.Host, c.Port, c.DBName, c.SSLMode, - ) + // Format host with port, bracketing IPv6 literals + host := formatHostWithPort(c.Host, c.Port) + + u := &url.URL{ + Scheme: "postgres", + User: url.UserPassword(c.User, c.Password), + Host: host, + Path: "/" + c.DBName, + RawQuery: url.Values{ + "sslmode": {c.SSLMode}, + }.Encode(), + } + + return u.String() +} + +// formatHostWithPort formats a host and port for use in a URL. +// IPv6 literal addresses are bracketed (e.g., [::1]:5432). +func formatHostWithPort(host, port string) string { + // Check if host is an IPv6 literal (contains colon but not already bracketed) + if strings.Contains(host, ":") && !strings.HasPrefix(host, "[") { + return fmt.Sprintf("[%s]:%s", host, port) + } + + return fmt.Sprintf("%s:%s", host, port) } // AuditDatabaseManager manages PostgreSQL connection for the audit trail. @@ -153,6 +178,16 @@ func (m *AuditDatabaseManager) GetConfig() *AuditDBConfig { return m.config } +// GetSSLMode returns the SSL mode for TLS detection. +// Implements readyz.PostgreSQLConfig interface. +func (m *AuditDatabaseManager) GetSSLMode() string { + if m == nil || m.config == nil { + return "" + } + + return m.config.SSLMode +} + // runMigrations executes version-tracked migrations against the audit database. // Uses lib-commons v4's NewMigrator without AllowMultiStatements, since // golang-migrate's multi-statement parser breaks PL/pgSQL $$-quoted functions. diff --git a/internal/bootstrap/config.go b/internal/bootstrap/config.go index a229b75..8314438 100644 --- a/internal/bootstrap/config.go +++ b/internal/bootstrap/config.go @@ -7,7 +7,9 @@ package bootstrap import ( "context" "fmt" + "net/url" "os" + "strings" "time" "github.com/LerianStudio/flowker/internal/adapters/http/in" @@ -16,8 +18,10 @@ import ( httpdashboard "github.com/LerianStudio/flowker/internal/adapters/http/in/dashboard" httpexecution "github.com/LerianStudio/flowker/internal/adapters/http/in/execution" httpexecutorconfig "github.com/LerianStudio/flowker/internal/adapters/http/in/executor_configuration" + httphealth "github.com/LerianStudio/flowker/internal/adapters/http/in/health" httpMiddleware "github.com/LerianStudio/flowker/internal/adapters/http/in/middleware" httpproviderconfig "github.com/LerianStudio/flowker/internal/adapters/http/in/provider_configuration" + "github.com/LerianStudio/flowker/internal/adapters/http/in/readyz" httpwebhook "github.com/LerianStudio/flowker/internal/adapters/http/in/webhook" httpworkflow "github.com/LerianStudio/flowker/internal/adapters/http/in/workflow" mongodashboard "github.com/LerianStudio/flowker/internal/adapters/mongodb/dashboard" @@ -57,6 +61,8 @@ type Config struct { OtelDeploymentEnv string `env:"OTEL_RESOURCE_DEPLOYMENT_ENVIRONMENT"` OtelColExporterEndpoint string `env:"OTEL_EXPORTER_OTLP_ENDPOINT"` EnableTelemetry bool `env:"ENABLE_TELEMETRY"` + // Deployment mode: saas (TLS mandatory), byoc (TLS recommended), local (TLS optional) + DeploymentMode string `env:"DEPLOYMENT_MODE"` // MongoDB configuration MongoURI string `env:"MONGO_URI"` MongoDBName string `env:"MONGO_DB_NAME"` @@ -86,6 +92,12 @@ type Config struct { // InitServers initiate http server. // Returns an error if any configuration or initialization step fails. func InitServers() (*Service, error) { + // Wire handler function variables to avoid import cycles. + // These functions allow health and readyz handlers to access bootstrap state + // without creating circular dependencies. + httphealth.SelfProbeOKFunc = SelfProbeOK + readyz.IsDrainingFunc = IsDraining + cfg := &Config{} if err := libCommons.SetConfigFromEnvVars(cfg); err != nil { @@ -133,6 +145,22 @@ func InitServers() (*Service, error) { return nil, fmt.Errorf("failed to initialize telemetry: %w", err) } + // Initialize readyz metrics after telemetry is available. + // Non-fatal: service continues without metrics if initialization fails. + if err := readyz.InitMetrics(); err != nil { + logger.Log(ctx, libLog.LevelWarn, "Failed to initialize readyz metrics", libLog.Any("error.message", err.Error())) + } + + // Validate TLS for SaaS mode - MUST be called BEFORE any database connection opens. + // This is a hard-fail at startup: in DEPLOYMENT_MODE=saas, all database connections + // must use TLS. The service refuses to start without TLS in SaaS mode. + if err := ValidateSaaSTLS(TLSConfig{ + MongoURI: cfg.MongoURI, + PostgresDSN: buildAuditDSN(), + }); err != nil { + return nil, fmt.Errorf("TLS enforcement failed: %w", err) + } + // Initialize MongoDB connection manager dbManager := NewDatabaseManagerWithConfig(&MongoConfig{ URI: cfg.MongoURI, @@ -175,7 +203,7 @@ func InitServers() (*Service, error) { return nil, fmt.Errorf("AUDIT_DB_HOST is required: audit trail is mandatory for compliance") } - auditHandler, auditWriter, err := initAuditComponents(logger) + auditHandler, auditWriter, auditDBManager, err := initAuditComponents(logger) if err != nil { logger.Log(ctx, libLog.LevelError, "Failed to initialize audit components", libLog.Any("error.message", err.Error())) return nil, err @@ -183,6 +211,39 @@ func InitServers() (*Service, error) { logger.Log(ctx, libLog.LevelInfo, "Audit trail components initialized successfully") + // Initialize readyz handler with both database checkers + // MongoDB checker + mongoChecker := readyz.NewMongoDBChecker(dbManager, dbManager) + // PostgreSQL checker (audit database) + postgresChecker := readyz.NewPostgreSQLChecker(auditDBManager, auditDBManager) + + readyzHandler := readyz.NewHandler( + readyz.WithChecker(mongoChecker), + readyz.WithChecker(postgresChecker), + ) + + logger.Log(ctx, libLog.LevelInfo, "Readyz handler initialized with MongoDB and PostgreSQL checkers") + + // Run startup self-probe to emit initial selfprobe_result gauge metrics. + // This records the health state of each dependency at service startup. + // Also gates /health endpoint - returns 503 until all deps are healthy. + LogSelfProbeStart(logger) + + results := readyzHandler.RunSelfProbe(ctx) + allHealthy := true + + for _, r := range results { + LogSelfProbeResult(logger, SelfProbeResult{Name: r.Name, Status: r.Status, Error: r.Err}) + + // Status vocabulary: up/skipped/n/a are healthy; down/degraded are unhealthy + if r.Status != "up" && r.Status != "skipped" && r.Status != "n/a" { + allHealthy = false + } + } + + LogSelfProbeComplete(logger, allHealthy) + SetSelfProbeOK(allHealthy) + // Initialize provider configuration components providerConfigHandler, providerConfigRepo, err := initProviderConfigComponents(dbManager, executorCatalog, auditWriter, logger) if err != nil { @@ -259,7 +320,7 @@ func InitServers() (*Service, error) { return nil, fmt.Errorf("failed to create auth guard: plugin auth is enabled but auth client is unavailable") } - httpApp, err := in.NewRoutes(logger, telemetry, swaggerCfg, dbManager, routeCfg, workflowHandler, catalogHandler, executorConfigHandler, providerConfigHandler, executionHandler, dashboardHandler, auditHandler, webhookHandler, authGuard) + httpApp, err := in.NewRoutes(logger, telemetry, swaggerCfg, dbManager, readyzHandler, routeCfg, workflowHandler, catalogHandler, executorConfigHandler, providerConfigHandler, executionHandler, dashboardHandler, auditHandler, webhookHandler, authGuard) if err != nil { logger.Log(ctx, libLog.LevelError, "Failed to create HTTP routes", libLog.Any("error.message", err.Error())) return nil, err @@ -739,9 +800,10 @@ type providerConfigListerAdapter struct { // initAuditComponents creates all audit-related repositories, queries, services and handlers. // It creates its own AuditDatabaseManager, connects to PostgreSQL, and sets up the full // audit read pipeline (queries + service facade + HTTP handler). +// Returns the handler, writer, and the database manager (needed for readyz health checks). func initAuditComponents( logger libLog.Logger, -) (*httpaudit.Handler, command.AuditWriter, error) { +) (*httpaudit.Handler, command.AuditWriter, *AuditDatabaseManager, error) { ctx := context.Background() // Create and connect audit database @@ -752,57 +814,57 @@ func initAuditComponents( if err := auditDBManager.Connect(connectCtx); err != nil { logger.Log(ctx, libLog.LevelError, "Failed to connect to audit database", libLog.Any("error.message", err.Error())) - return nil, nil, err + return nil, nil, nil, err } // Create PostgreSQL audit repository auditRepo, err := pgaudit.NewPostgreSQLRepository(auditDBManager.GetPool()) if err != nil { logger.Log(ctx, libLog.LevelError, "Failed to create audit repository", libLog.Any("error.message", err.Error())) - return nil, nil, err + return nil, nil, nil, err } // Create audit writer command (used by all command handlers for mandatory audit trail) auditWriter, err := command.NewRecordAuditEventCommand(auditRepo) if err != nil { logger.Log(ctx, libLog.LevelError, "Failed to create RecordAuditEventCommand", libLog.Any("error.message", err.Error())) - return nil, nil, err + return nil, nil, nil, err } // Create query services searchQuery, err := query.NewSearchAuditLogsQuery(auditRepo) if err != nil { logger.Log(ctx, libLog.LevelError, "Failed to create SearchAuditLogsQuery", libLog.Any("error.message", err.Error())) - return nil, nil, err + return nil, nil, nil, err } getByIDQuery, err := query.NewGetAuditEntryByIDQuery(auditRepo) if err != nil { logger.Log(ctx, libLog.LevelError, "Failed to create GetAuditEntryByIDQuery", libLog.Any("error.message", err.Error())) - return nil, nil, err + return nil, nil, nil, err } verifyHashQuery, err := query.NewVerifyAuditHashChainQuery(auditRepo) if err != nil { logger.Log(ctx, libLog.LevelError, "Failed to create VerifyAuditHashChainQuery", libLog.Any("error.message", err.Error())) - return nil, nil, err + return nil, nil, nil, err } // Create audit service facade auditSvc, err := services.NewAuditService(searchQuery, getByIDQuery, verifyHashQuery) if err != nil { logger.Log(ctx, libLog.LevelError, "Failed to create AuditService", libLog.Any("error.message", err.Error())) - return nil, nil, err + return nil, nil, nil, err } // Create HTTP handler auditHandler, err := httpaudit.NewHandler(auditSvc) if err != nil { logger.Log(ctx, libLog.LevelError, "Failed to create audit handler", libLog.Any("error.message", err.Error())) - return nil, nil, err + return nil, nil, nil, err } - return auditHandler, auditWriter, nil + return auditHandler, auditWriter, auditDBManager, nil } // populateWebhookRegistry loads all active workflows from the database and @@ -867,3 +929,45 @@ func (a *providerConfigListerAdapter) ListActiveByProvider(ctx context.Context, return result.Items, nil } + +// buildAuditDSN constructs the audit database DSN from environment variables. +// Used by ValidateSaaSTLS to check TLS configuration before any connection is opened. +// Returns empty string if AUDIT_DB_HOST is not configured (dependency not used). +// +// Uses url.URL to properly escape special characters in credentials (e.g., @, /, :). +// Handles IPv6 literal hosts by bracketing them (e.g., [::1]:5432). +func buildAuditDSN() string { + host := getEnvOrDefault("AUDIT_DB_HOST", "") + if host == "" { + return "" + } + + port := getEnvOrDefault("AUDIT_DB_PORT", "5432") + + // Build DSN using url.URL for proper encoding of special characters in credentials + u := &url.URL{ + Scheme: "postgres", + User: url.UserPassword( + getEnvOrDefault("AUDIT_DB_USER", "flowker_audit"), + getEnvOrDefault("AUDIT_DB_PASSWORD", "flowker_audit"), + ), + Host: formatHostPort(host, port), + Path: "/" + getEnvOrDefault("AUDIT_DB_NAME", "flowker_audit"), + RawQuery: url.Values{ + "sslmode": {getEnvOrDefault("AUDIT_DB_SSL_MODE", "disable")}, + }.Encode(), + } + + return u.String() +} + +// formatHostPort formats a host and port for use in a URL. +// IPv6 literal addresses are bracketed (e.g., [::1]:5432). +func formatHostPort(host, port string) string { + // Check if host is an IPv6 literal (contains colon but not already bracketed) + if strings.Contains(host, ":") && !strings.HasPrefix(host, "[") { + return fmt.Sprintf("[%s]:%s", host, port) + } + + return fmt.Sprintf("%s:%s", host, port) +} diff --git a/internal/bootstrap/database.go b/internal/bootstrap/database.go index 464cfeb..86f286c 100644 --- a/internal/bootstrap/database.go +++ b/internal/bootstrap/database.go @@ -179,3 +179,23 @@ func (dm *DatabaseManager) IsConnected() bool { func (dm *DatabaseManager) GetConfig() *MongoConfig { return dm.config } + +// GetURI returns the MongoDB URI for TLS detection. +// Implements readyz.MongoDBConfig interface. +func (dm *DatabaseManager) GetURI() string { + if dm.config == nil { + return "" + } + + return dm.config.URI +} + +// GetTLSCACert returns the TLS CA certificate for TLS detection. +// Implements readyz.MongoDBConfig interface. +func (dm *DatabaseManager) GetTLSCACert() string { + if dm.config == nil { + return "" + } + + return dm.config.TLSCACert +} diff --git a/internal/bootstrap/http_server.go b/internal/bootstrap/http_server.go index 529b914..d1e9fc4 100644 --- a/internal/bootstrap/http_server.go +++ b/internal/bootstrap/http_server.go @@ -36,7 +36,23 @@ func NewHTTPServer(cfg *Config, app *fiber.App, logger libCommonsLog.Logger, tel } // Run runs the server. +// Registers graceful shutdown with drain coupling: when SIGTERM/SIGINT is received, +// IsDraining() becomes true immediately, causing /readyz to return 503. +// This signals Kubernetes to stop routing new traffic before the server shuts down. func (s *HTTPServer) Run(l *libCommons.Launcher) error { + // Register graceful shutdown with drain coupling. + // This sets IsDraining() = true immediately on signal receipt, + // then waits for the grace period before actually shutting down. + RegisterGracefulShutdown(GracefulShutdownConfig{ + App: s.app, + Logger: s.logger, + DrainGracePeriod: DefaultDrainGracePeriod, + OnShutdown: nil, // No additional cleanup needed; lib-commons handles telemetry flush + }) + + // Use lib-commons server manager for the actual startup and server management. + // Note: Our RegisterGracefulShutdown handles SIGTERM/SIGINT with drain coupling, + // so the server manager's graceful shutdown will be triggered by us calling App.Shutdown(). libCommonsServer.NewServerManager(nil, s.telemetry, s.logger). WithHTTPServer(s.app, s.serverAddress). StartWithGracefulShutdown() diff --git a/internal/bootstrap/selfprobe.go b/internal/bootstrap/selfprobe.go new file mode 100644 index 0000000..a8e601a --- /dev/null +++ b/internal/bootstrap/selfprobe.go @@ -0,0 +1,74 @@ +// Copyright (c) 2026 Lerian Studio. All rights reserved. +// Use of this source code is governed by the Elastic License 2.0 +// that can be found in the LICENSE file. + +package bootstrap + +import ( + "context" + "sync/atomic" + + libLog "github.com/LerianStudio/lib-commons/v5/commons/log" +) + +// selfProbeOK indicates whether the startup self-probe passed. +// Initialized to false; set to true only after all dependencies are verified healthy. +var selfProbeOK atomic.Bool + +func init() { + selfProbeOK.Store(false) // unhealthy until proven otherwise +} + +// SelfProbeOK returns whether the startup self-probe passed. +// Used by /health to gate liveness: if false, /health returns 503. +func SelfProbeOK() bool { + return selfProbeOK.Load() +} + +// SetSelfProbeOK sets the self-probe result. +// Should only be called once at startup after RunSelfProbe completes. +func SetSelfProbeOK(ok bool) { + selfProbeOK.Store(ok) +} + +// SelfProbeResult represents the result of a dependency's self-probe. +type SelfProbeResult struct { + Name string // Dependency name (e.g., "mongodb", "postgresql") + Status string // up, down, skipped + Error error // Non-nil when status is "down" +} + +// LogSelfProbeStart logs the start of the self-probe. +func LogSelfProbeStart(logger libLog.Logger) { + logger.Log(context.Background(), libLog.LevelInfo, "startup_self_probe_started", + libLog.String("probe", "self")) +} + +// LogSelfProbeResult logs a single dependency's probe result. +func LogSelfProbeResult(logger libLog.Logger, result SelfProbeResult) { + if result.Status == "up" || result.Status == "skipped" { + logger.Log(context.Background(), libLog.LevelInfo, "self_probe_check", + libLog.String("probe", "self"), + libLog.String("name", result.Name), + libLog.String("status", result.Status), + ) + } else { + logger.Log(context.Background(), libLog.LevelError, "self_probe_check", + libLog.String("probe", "self"), + libLog.String("name", result.Name), + libLog.String("status", result.Status), + libLog.Any("error.message", result.Error), + ) + } +} + +// LogSelfProbeComplete logs the completion of the self-probe. +func LogSelfProbeComplete(logger libLog.Logger, passed bool) { + if passed { + logger.Log(context.Background(), libLog.LevelInfo, "startup_self_probe_passed", + libLog.String("probe", "self")) + } else { + logger.Log(context.Background(), libLog.LevelError, "startup_self_probe_failed", + libLog.String("probe", "self")) + } +} diff --git a/internal/bootstrap/selfprobe_test.go b/internal/bootstrap/selfprobe_test.go new file mode 100644 index 0000000..cfa0ee9 --- /dev/null +++ b/internal/bootstrap/selfprobe_test.go @@ -0,0 +1,71 @@ +// Copyright (c) 2026 Lerian Studio. All rights reserved. +// Use of this source code is governed by the Elastic License 2.0 +// that can be found in the LICENSE file. + +//go:build unit + +package bootstrap_test + +import ( + "testing" + + "github.com/LerianStudio/flowker/internal/bootstrap" + "github.com/stretchr/testify/assert" +) + +func TestSelfProbeOK_InitialValue(t *testing.T) { + // Reset to known state for test isolation + bootstrap.SetSelfProbeOK(false) + + // Assert: Initial value should be false (unhealthy until proven otherwise) + assert.False(t, bootstrap.SelfProbeOK(), "SelfProbeOK should be false initially") +} + +func TestSetSelfProbeOK_SetsToTrue(t *testing.T) { + // Arrange: Start with false + bootstrap.SetSelfProbeOK(false) + + // Act: Set to true + bootstrap.SetSelfProbeOK(true) + + // Assert: Value should be true + assert.True(t, bootstrap.SelfProbeOK(), "SelfProbeOK should be true after SetSelfProbeOK(true)") +} + +func TestSetSelfProbeOK_SetsToFalse(t *testing.T) { + // Arrange: Start with true + bootstrap.SetSelfProbeOK(true) + + // Act: Set to false + bootstrap.SetSelfProbeOK(false) + + // Assert: Value should be false + assert.False(t, bootstrap.SelfProbeOK(), "SelfProbeOK should be false after SetSelfProbeOK(false)") +} + +func TestSelfProbeResult_StructFields(t *testing.T) { + // Test that SelfProbeResult struct has the expected fields + result := bootstrap.SelfProbeResult{ + Name: "mongodb", + Status: "up", + Error: nil, + } + + assert.Equal(t, "mongodb", result.Name) + assert.Equal(t, "up", result.Status) + assert.Nil(t, result.Error) +} + +func TestSelfProbeResult_WithError(t *testing.T) { + // Test SelfProbeResult with an error + err := assert.AnError + result := bootstrap.SelfProbeResult{ + Name: "postgresql", + Status: "down", + Error: err, + } + + assert.Equal(t, "postgresql", result.Name) + assert.Equal(t, "down", result.Status) + assert.Equal(t, err, result.Error) +} diff --git a/internal/bootstrap/shutdown.go b/internal/bootstrap/shutdown.go new file mode 100644 index 0000000..6a6cfd4 --- /dev/null +++ b/internal/bootstrap/shutdown.go @@ -0,0 +1,105 @@ +// Copyright (c) 2026 Lerian Studio. All rights reserved. +// Use of this source code is governed by the Elastic License 2.0 +// that can be found in the LICENSE file. + +package bootstrap + +import ( + "context" + "os" + "os/signal" + "sync/atomic" + "syscall" + "time" + + libLog "github.com/LerianStudio/lib-commons/v5/commons/log" + "github.com/gofiber/fiber/v2" +) + +// drainingState indicates whether the service is draining (shutting down). +// When true, /readyz returns 503 to stop K8s from routing new traffic. +var drainingState atomic.Bool + +func init() { + drainingState.Store(false) +} + +// IsDraining returns whether the service is in drain mode. +// Used by /readyz to short-circuit to 503 during graceful shutdown. +func IsDraining() bool { + return drainingState.Load() +} + +// SetDraining sets the draining state. +// Called when SIGTERM/SIGINT is received to signal graceful shutdown. +func SetDraining(draining bool) { + drainingState.Store(draining) +} + +// DefaultDrainGracePeriod is the default time to wait after setting draining +// before shutting down the server. +// Should be >= K8s periodSeconds * failureThreshold + buffer. +// Default: 12 seconds (covers 5s period * 2 failures + 2s buffer). +const DefaultDrainGracePeriod = 12 * time.Second + +// GracefulShutdownConfig configures graceful shutdown behavior. +type GracefulShutdownConfig struct { + App *fiber.App // The Fiber app to shutdown + Logger libLog.Logger // Logger for shutdown events + DrainGracePeriod time.Duration // Time to wait for K8s to observe 503 + OnShutdown func(ctx context.Context) error // Optional cleanup callback +} + +// RegisterGracefulShutdown sets up SIGTERM/SIGINT handling with drain coupling. +// This sets the drain state immediately on signal receipt, then waits for +// the grace period before actually shutting down the server. +// +// Flow: +// 1. SIGTERM/SIGINT received +// 2. Set draining state (IsDraining() returns true) +// 3. /readyz now returns 503 +// 4. Wait DrainGracePeriod for K8s to stop routing traffic +// 5. Shutdown the Fiber app +// 6. Run OnShutdown callback if provided +func RegisterGracefulShutdown(cfg GracefulShutdownConfig) { + if cfg.DrainGracePeriod == 0 { + cfg.DrainGracePeriod = DefaultDrainGracePeriod + } + + sig := make(chan os.Signal, 1) + signal.Notify(sig, syscall.SIGTERM, syscall.SIGINT) + + go func() { + <-sig + signal.Stop(sig) // Stop receiving signals to prevent leak + + cfg.Logger.Log(context.Background(), libLog.LevelInfo, "received_shutdown_signal", + libLog.String("state", "draining")) + + // Set draining state - /readyz will now return 503 + SetDraining(true) + + // Wait for K8s to observe 503 on /readyz and stop routing + cfg.Logger.Log(context.Background(), libLog.LevelInfo, "waiting_for_drain_grace_period", + libLog.String("grace_period", cfg.DrainGracePeriod.String())) + time.Sleep(cfg.DrainGracePeriod) + + // Shutdown the server + cfg.Logger.Log(context.Background(), libLog.LevelInfo, "shutting_down_server") + if err := cfg.App.Shutdown(); err != nil { + cfg.Logger.Log(context.Background(), libLog.LevelError, "server_shutdown_error", + libLog.Any("error.message", err)) + } + + // Run cleanup if provided + if cfg.OnShutdown != nil { + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + if err := cfg.OnShutdown(ctx); err != nil { + cfg.Logger.Log(context.Background(), libLog.LevelError, "cleanup_error", + libLog.Any("error.message", err)) + } + } + }() +} diff --git a/internal/bootstrap/shutdown_test.go b/internal/bootstrap/shutdown_test.go new file mode 100644 index 0000000..d3e2594 --- /dev/null +++ b/internal/bootstrap/shutdown_test.go @@ -0,0 +1,67 @@ +// Copyright (c) 2026 Lerian Studio. All rights reserved. +// Use of this source code is governed by the Elastic License 2.0 +// that can be found in the LICENSE file. + +//go:build unit + +package bootstrap_test + +import ( + "testing" + "time" + + "github.com/LerianStudio/flowker/internal/bootstrap" + "github.com/stretchr/testify/assert" +) + +func TestIsDraining_InitialValue(t *testing.T) { + // Reset to known state for test isolation + bootstrap.SetDraining(false) + + // Assert: Initial value should be false + assert.False(t, bootstrap.IsDraining(), "IsDraining should be false initially") +} + +func TestSetDraining_SetsToTrue(t *testing.T) { + // Arrange: Start with false + bootstrap.SetDraining(false) + + // Act: Set to true + bootstrap.SetDraining(true) + + // Assert: Value should be true + assert.True(t, bootstrap.IsDraining(), "IsDraining should be true after SetDraining(true)") +} + +func TestSetDraining_SetsToFalse(t *testing.T) { + // Arrange: Start with true + bootstrap.SetDraining(true) + + // Act: Set to false + bootstrap.SetDraining(false) + + // Assert: Value should be false + assert.False(t, bootstrap.IsDraining(), "IsDraining should be false after SetDraining(false)") +} + +func TestDefaultDrainGracePeriod(t *testing.T) { + // Assert: Default grace period should be 12 seconds per Ring Standards + // (>= K8s periodSeconds * failureThreshold + buffer) + assert.Equal(t, 12*time.Second, bootstrap.DefaultDrainGracePeriod, + "DefaultDrainGracePeriod should be 12 seconds") +} + +func TestGracefulShutdownConfig_Fields(t *testing.T) { + // Test that GracefulShutdownConfig struct has the expected fields + cfg := bootstrap.GracefulShutdownConfig{ + App: nil, // fiber.App + Logger: nil, // libLog.Logger + DrainGracePeriod: 10 * time.Second, + OnShutdown: nil, + } + + assert.Equal(t, 10*time.Second, cfg.DrainGracePeriod) + assert.Nil(t, cfg.App) + assert.Nil(t, cfg.Logger) + assert.Nil(t, cfg.OnShutdown) +} diff --git a/internal/bootstrap/tls_detection.go b/internal/bootstrap/tls_detection.go new file mode 100644 index 0000000..c98873b --- /dev/null +++ b/internal/bootstrap/tls_detection.go @@ -0,0 +1,87 @@ +// Copyright (c) 2026 Lerian Studio. All rights reserved. +// Use of this source code is governed by the Elastic License 2.0 +// that can be found in the LICENSE file. + +package bootstrap + +import ( + "net/url" +) + +// DetectMongoTLS checks if a MongoDB connection URI has TLS enabled. +// Returns (true, nil) if TLS is configured, (false, nil) if not, (false, error) for parse failures. +// +// TLS is considered enabled if: +// - URI scheme is mongodb+srv:// (always uses TLS per MongoDB specification) +// - URI has tls=true query parameter +// - URI has ssl=true query parameter (legacy, equivalent to tls=true) +// +// IMPORTANT: This function uses url.Parse and url.Query().Get() to avoid +// anti-pattern #4 (substring matching like strings.Contains(uri, "tls=true")) +// which fails for URL-encoded parameters and matches false positives. +func DetectMongoTLS(uri string) (bool, error) { + if uri == "" { + return false, nil + } + + parsed, err := url.Parse(uri) + if err != nil { + return false, err + } + + // mongodb+srv scheme always uses TLS per MongoDB specification + // https://www.mongodb.com/docs/manual/reference/connection-string/#dns-seed-list-connection-format + if parsed.Scheme == "mongodb+srv" { + return true, nil + } + + // Check tls or ssl query parameter using url.Query().Get() + // This correctly handles URL-encoded parameters and parameter boundaries + query := parsed.Query() + if query.Get("tls") == "true" || query.Get("ssl") == "true" { + return true, nil + } + + return false, nil +} + +// DetectPostgresTLS checks if a PostgreSQL DSN has TLS enabled. +// Returns (true, nil) if TLS is configured, (false, nil) if not, (false, error) for parse failures. +// +// TLS is considered enabled ONLY for strict modes that GUARANTEE TLS: +// PostgreSQL sslmode values: +// - disable: No SSL/TLS +// - allow: Try non-TLS first, fallback to TLS (CAN connect without TLS) +// - prefer: Try TLS first, fallback to non-TLS (CAN connect without TLS) +// - require: Require TLS, no CA verification (ALWAYS TLS) +// - verify-ca: Require TLS with CA verification (ALWAYS TLS) +// - verify-full: Require TLS with CA + hostname verification (ALWAYS TLS) +// +// CRITICAL: Only "require", "verify-ca", "verify-full" guarantee TLS. +// "allow" and "prefer" can silently fallback to cleartext connections, +// which is unacceptable for SaaS deployments. +// +// IMPORTANT: This function uses url.Parse and url.Query().Get() to avoid +// anti-pattern #4 (substring matching like strings.Contains(dsn, "sslmode=")) +// which fails for URL-encoded parameters and matches false positives. +func DetectPostgresTLS(dsn string) (bool, error) { + if dsn == "" { + return false, nil + } + + parsed, err := url.Parse(dsn) + if err != nil { + return false, err + } + + sslmode := parsed.Query().Get("sslmode") + + // Only strict modes that GUARANTEE TLS are considered TLS-enabled. + // "allow" and "prefer" can fallback to cleartext, which is unacceptable for SaaS. + switch sslmode { + case "require", "verify-ca", "verify-full": + return true, nil + default: + return false, nil + } +} diff --git a/internal/bootstrap/tls_detection_test.go b/internal/bootstrap/tls_detection_test.go new file mode 100644 index 0000000..98ae0c8 --- /dev/null +++ b/internal/bootstrap/tls_detection_test.go @@ -0,0 +1,233 @@ +// Copyright (c) 2026 Lerian Studio. All rights reserved. +// Use of this source code is governed by the Elastic License 2.0 +// that can be found in the LICENSE file. + +//go:build unit + +package bootstrap_test + +import ( + "testing" + + "github.com/LerianStudio/flowker/internal/bootstrap" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestDetectMongoTLS(t *testing.T) { + testCases := []struct { + name string + uri string + expected bool + expectError bool + }{ + { + name: "empty URI returns false", + uri: "", + expected: false, + expectError: false, + }, + { + name: "plain mongodb URI without TLS", + uri: "mongodb://localhost:27017", + expected: false, + expectError: false, + }, + { + name: "mongodb URI with database without TLS", + uri: "mongodb://localhost:27017/flowker", + expected: false, + expectError: false, + }, + { + name: "mongodb URI with tls=true query parameter", + uri: "mongodb://localhost:27017/db?tls=true", + expected: true, + expectError: false, + }, + { + name: "mongodb URI with ssl=true query parameter (legacy)", + uri: "mongodb://localhost:27017/db?ssl=true", + expected: true, + expectError: false, + }, + { + name: "mongodb+srv scheme always uses TLS", + uri: "mongodb+srv://cluster.mongodb.net/db", + expected: true, + expectError: false, + }, + { + name: "mongodb+srv with explicit tls=true", + uri: "mongodb+srv://cluster.mongodb.net/db?tls=true", + expected: true, + expectError: false, + }, + { + name: "mongodb URI with tls=false", + uri: "mongodb://localhost:27017/db?tls=false", + expected: false, + expectError: false, + }, + { + name: "mongodb URI with URL-encoded parameters", + uri: "mongodb://localhost:27017/db?tls=true&authSource=admin", + expected: true, + expectError: false, + }, + { + name: "mongodb URI with multiple query params and tls last", + uri: "mongodb://localhost:27017/db?authSource=admin&tls=true", + expected: true, + expectError: false, + }, + { + name: "mongodb URI with credentials and tls", + uri: "mongodb://user:password@localhost:27017/db?tls=true", + expected: true, + expectError: false, + }, + { + name: "malformed URI returns error", + uri: "://invalid-uri", + expected: false, + expectError: true, + }, + { + name: "mongodb URI with replica set and tls", + uri: "mongodb://host1:27017,host2:27017/db?replicaSet=rs0&tls=true", + expected: true, + expectError: false, + }, + // Anti-pattern #4 regression test: ensure substring-ambiguous cases work correctly + { + name: "anti-pattern regression: tls parameter in path should not match", + uri: "mongodb://localhost:27017/tls_database", + expected: false, + expectError: false, + }, + { + name: "anti-pattern regression: tls=false with true in host", + uri: "mongodb://tls-true-host:27017/db?tls=false", + expected: false, + expectError: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + result, err := bootstrap.DetectMongoTLS(tc.uri) + + if tc.expectError { + require.Error(t, err, "expected error for URI: %s", tc.uri) + } else { + require.NoError(t, err, "unexpected error for URI: %s", tc.uri) + } + + assert.Equal(t, tc.expected, result, "TLS detection mismatch for URI: %s", tc.uri) + }) + } +} + +func TestDetectPostgresTLS(t *testing.T) { + testCases := []struct { + name string + dsn string + expected bool + expectError bool + }{ + { + name: "empty DSN returns false", + dsn: "", + expected: false, + expectError: false, + }, + { + name: "postgres DSN without sslmode", + dsn: "postgres://localhost:5432/db", + expected: false, + expectError: false, + }, + { + name: "postgres DSN with sslmode=disable", + dsn: "postgres://localhost:5432/db?sslmode=disable", + expected: false, + expectError: false, + }, + { + name: "postgres DSN with sslmode=allow returns false (can fallback to cleartext)", + dsn: "postgres://localhost:5432/db?sslmode=allow", + expected: false, + expectError: false, + }, + { + name: "postgres DSN with sslmode=prefer returns false (can fallback to cleartext)", + dsn: "postgres://localhost:5432/db?sslmode=prefer", + expected: false, + expectError: false, + }, + { + name: "postgres DSN with sslmode=require", + dsn: "postgres://localhost:5432/db?sslmode=require", + expected: true, + expectError: false, + }, + { + name: "postgres DSN with sslmode=verify-ca", + dsn: "postgres://localhost:5432/db?sslmode=verify-ca", + expected: true, + expectError: false, + }, + { + name: "postgres DSN with sslmode=verify-full", + dsn: "postgres://localhost:5432/db?sslmode=verify-full", + expected: true, + expectError: false, + }, + { + name: "postgres DSN with credentials and sslmode", + dsn: "postgres://user:password@localhost:5432/db?sslmode=require", + expected: true, + expectError: false, + }, + { + name: "postgres DSN with multiple query params", + dsn: "postgres://localhost:5432/db?sslmode=require&connect_timeout=10", + expected: true, + expectError: false, + }, + { + name: "malformed DSN returns error", + dsn: "://invalid-dsn", + expected: false, + expectError: true, + }, + // Anti-pattern #4 regression test: ensure substring-ambiguous cases work correctly + { + name: "anti-pattern regression: sslmode in path should not match", + dsn: "postgres://localhost:5432/sslmode_database", + expected: false, + expectError: false, + }, + { + name: "anti-pattern regression: sslmode=disable with require in host", + dsn: "postgres://sslmode-require-host:5432/db?sslmode=disable", + expected: false, + expectError: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + result, err := bootstrap.DetectPostgresTLS(tc.dsn) + + if tc.expectError { + require.Error(t, err, "expected error for DSN: %s", tc.dsn) + } else { + require.NoError(t, err, "unexpected error for DSN: %s", tc.dsn) + } + + assert.Equal(t, tc.expected, result, "TLS detection mismatch for DSN: %s", tc.dsn) + }) + } +} diff --git a/internal/bootstrap/tls_enforcement.go b/internal/bootstrap/tls_enforcement.go new file mode 100644 index 0000000..79a93f2 --- /dev/null +++ b/internal/bootstrap/tls_enforcement.go @@ -0,0 +1,79 @@ +// Copyright (c) 2026 Lerian Studio. All rights reserved. +// Use of this source code is governed by the Elastic License 2.0 +// that can be found in the LICENSE file. + +package bootstrap + +import ( + "fmt" + "os" + "strings" + + "github.com/LerianStudio/flowker/pkg" +) + +// TLSConfig holds connection information for TLS validation. +// Used by ValidateSaaSTLS to verify TLS is enabled before connections are opened. +type TLSConfig struct { + // MongoURI is the MongoDB connection URI. + MongoURI string + + // PostgresDSN is the PostgreSQL connection string. + PostgresDSN string +} + +// ValidateSaaSTLS validates that all database connections use TLS when +// DEPLOYMENT_MODE=saas. Returns an error if any connection is non-TLS. +// +// MUST be called BEFORE any database connection is opened. +// This is a hard-fail at startup - the service will not start without TLS in SaaS mode. +// +// Deployment modes: +// - saas: TLS MANDATORY for all DB connections +// - byoc: TLS recommended but not hard-enforced +// - local: TLS optional (developer workstation) +// - (unset): defaults to local behavior +// +// Error messages specify which dependency caused the violation for operator diagnostics. +func ValidateSaaSTLS(cfg TLSConfig) error { + mode := os.Getenv("DEPLOYMENT_MODE") + if !strings.EqualFold(mode, "saas") { + return nil // Only enforce in SaaS mode + } + + // Check MongoDB TLS + if cfg.MongoURI != "" { + tls, err := DetectMongoTLS(cfg.MongoURI) + if err != nil { + return fmt.Errorf("validate TLS for mongodb: %w", err) + } + + if !tls { + return pkg.ValidationError{ + EntityType: "bootstrap", + Code: "TLS_REQUIRED_MONGODB", + Title: "TLS Required for MongoDB", + Message: "DEPLOYMENT_MODE=saas: TLS required for mongodb but not configured. Use mongodb+srv:// scheme or add tls=true/ssl=true query parameter.", + } + } + } + + // Check PostgreSQL TLS + if cfg.PostgresDSN != "" { + tls, err := DetectPostgresTLS(cfg.PostgresDSN) + if err != nil { + return fmt.Errorf("validate TLS for postgresql: %w", err) + } + + if !tls { + return pkg.ValidationError{ + EntityType: "bootstrap", + Code: "TLS_REQUIRED_POSTGRESQL", + Title: "TLS Required for PostgreSQL", + Message: "DEPLOYMENT_MODE=saas: TLS required for postgresql but not configured. Use sslmode=require, sslmode=verify-ca, or sslmode=verify-full.", + } + } + } + + return nil +} diff --git a/internal/bootstrap/tls_enforcement_test.go b/internal/bootstrap/tls_enforcement_test.go new file mode 100644 index 0000000..05e9255 --- /dev/null +++ b/internal/bootstrap/tls_enforcement_test.go @@ -0,0 +1,240 @@ +// Copyright (c) 2026 Lerian Studio. All rights reserved. +// Use of this source code is governed by the Elastic License 2.0 +// that can be found in the LICENSE file. + +package bootstrap + +import ( + "os" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestValidateSaaSTLS(t *testing.T) { + // Save original env value and restore after test + originalMode := os.Getenv("DEPLOYMENT_MODE") + defer func() { + if originalMode == "" { + os.Unsetenv("DEPLOYMENT_MODE") + } else { + os.Setenv("DEPLOYMENT_MODE", originalMode) + } + }() + + tests := []struct { + name string + deploymentMode string + unsetMode bool // if true, DEPLOYMENT_MODE is unset (not empty string) + cfg TLSConfig + wantErr bool + errContains string + }{ + { + name: "saas mode with non-TLS MongoDB URI returns error", + deploymentMode: "saas", + cfg: TLSConfig{ + MongoURI: "mongodb://localhost:27017/flowker", + PostgresDSN: "postgres://user:pass@localhost:5432/audit?sslmode=require", + }, + wantErr: true, + errContains: "mongodb", + }, + { + name: "saas mode with non-TLS PostgreSQL DSN returns error", + deploymentMode: "saas", + cfg: TLSConfig{ + MongoURI: "mongodb+srv://user:pass@cluster.mongodb.net/flowker", + PostgresDSN: "postgres://user:pass@localhost:5432/audit?sslmode=disable", + }, + wantErr: true, + errContains: "postgresql", + }, + { + name: "saas mode with TLS enabled for all databases returns nil", + deploymentMode: "saas", + cfg: TLSConfig{ + MongoURI: "mongodb+srv://user:pass@cluster.mongodb.net/flowker", + PostgresDSN: "postgres://user:pass@localhost:5432/audit?sslmode=verify-full", + }, + wantErr: false, + }, + { + name: "local mode with non-TLS DSNs returns nil (no enforcement)", + deploymentMode: "local", + cfg: TLSConfig{ + MongoURI: "mongodb://localhost:27017/flowker", + PostgresDSN: "postgres://user:pass@localhost:5432/audit?sslmode=disable", + }, + wantErr: false, + }, + { + name: "byoc mode with non-TLS DSNs returns nil (no enforcement)", + deploymentMode: "byoc", + cfg: TLSConfig{ + MongoURI: "mongodb://localhost:27017/flowker", + PostgresDSN: "postgres://user:pass@localhost:5432/audit?sslmode=disable", + }, + wantErr: false, + }, + { + name: "saas mode with empty DSN returns nil (dependency not configured)", + deploymentMode: "saas", + cfg: TLSConfig{ + MongoURI: "", + PostgresDSN: "", + }, + wantErr: false, + }, + { + name: "saas mode with malformed MongoDB URI returns wrapped parse error", + deploymentMode: "saas", + cfg: TLSConfig{ + MongoURI: "://invalid-uri", + PostgresDSN: "postgres://user:pass@localhost:5432/audit?sslmode=require", + }, + wantErr: true, + errContains: "mongodb", + }, + { + name: "saas mode with malformed PostgreSQL DSN returns wrapped parse error", + deploymentMode: "saas", + cfg: TLSConfig{ + MongoURI: "mongodb+srv://user:pass@cluster.mongodb.net/flowker", + PostgresDSN: "://invalid-dsn", + }, + wantErr: true, + errContains: "postgresql", + }, + { + name: "DEPLOYMENT_MODE unset (defaults to local) with non-TLS DSNs returns nil", + unsetMode: true, + cfg: TLSConfig{ + MongoURI: "mongodb://localhost:27017/flowker", + PostgresDSN: "postgres://user:pass@localhost:5432/audit?sslmode=disable", + }, + wantErr: false, + }, + { + name: "saas mode with tls=true query param returns nil", + deploymentMode: "saas", + cfg: TLSConfig{ + MongoURI: "mongodb://localhost:27017/flowker?tls=true", + PostgresDSN: "postgres://user:pass@localhost:5432/audit?sslmode=require", + }, + wantErr: false, + }, + { + name: "saas mode with ssl=true (legacy) query param returns nil", + deploymentMode: "saas", + cfg: TLSConfig{ + MongoURI: "mongodb://localhost:27017/flowker?ssl=true", + PostgresDSN: "postgres://user:pass@localhost:5432/audit?sslmode=verify-ca", + }, + wantErr: false, + }, + { + name: "saas mode with only MongoDB configured (no PostgreSQL) validates MongoDB only", + deploymentMode: "saas", + cfg: TLSConfig{ + MongoURI: "mongodb://localhost:27017/flowker", // non-TLS + PostgresDSN: "", // not configured + }, + wantErr: true, + errContains: "mongodb", + }, + { + name: "saas mode with only PostgreSQL configured (no MongoDB) validates PostgreSQL only", + deploymentMode: "saas", + cfg: TLSConfig{ + MongoURI: "", // not configured + PostgresDSN: "postgres://user:pass@localhost:5432/audit?sslmode=disable", // non-TLS + }, + wantErr: true, + errContains: "postgresql", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Set or unset DEPLOYMENT_MODE based on test case + if tt.unsetMode { + os.Unsetenv("DEPLOYMENT_MODE") + } else { + os.Setenv("DEPLOYMENT_MODE", tt.deploymentMode) + } + + err := ValidateSaaSTLS(tt.cfg) + + if tt.wantErr { + require.Error(t, err, "expected error but got nil") + assert.True(t, strings.Contains(strings.ToLower(err.Error()), tt.errContains), + "error should mention %q, got: %s", tt.errContains, err.Error()) + } else { + require.NoError(t, err, "expected no error but got: %v", err) + } + }) + } +} + +func TestValidateSaaSTLS_EdgeCases(t *testing.T) { + originalMode := os.Getenv("DEPLOYMENT_MODE") + defer func() { + if originalMode == "" { + os.Unsetenv("DEPLOYMENT_MODE") + } else { + os.Setenv("DEPLOYMENT_MODE", originalMode) + } + }() + + t.Run("saas mode case insensitivity", func(t *testing.T) { + // DEPLOYMENT_MODE should be case-insensitive: "SAAS", "SaaS", "saas" all trigger enforcement + os.Setenv("DEPLOYMENT_MODE", "SAAS") + cfg := TLSConfig{ + MongoURI: "mongodb://localhost:27017/flowker", + PostgresDSN: "postgres://user:pass@localhost:5432/audit?sslmode=disable", + } + + err := ValidateSaaSTLS(cfg) + // "SAAS" (uppercase) should trigger enforcement just like "saas" + assert.Error(t, err, "uppercase SAAS should trigger enforcement (case-insensitive)") + assert.Contains(t, strings.ToLower(err.Error()), "mongodb") + }) + + t.Run("empty deployment mode treated as non-saas", func(t *testing.T) { + os.Setenv("DEPLOYMENT_MODE", "") + cfg := TLSConfig{ + MongoURI: "mongodb://localhost:27017/flowker", + PostgresDSN: "postgres://user:pass@localhost:5432/audit?sslmode=disable", + } + + err := ValidateSaaSTLS(cfg) + assert.NoError(t, err, "empty DEPLOYMENT_MODE should not trigger enforcement") + }) + + t.Run("PostgreSQL sslmode=prefer is NOT considered TLS enabled (can fallback to cleartext)", func(t *testing.T) { + os.Setenv("DEPLOYMENT_MODE", "saas") + cfg := TLSConfig{ + MongoURI: "mongodb+srv://user:pass@cluster.mongodb.net/flowker", + PostgresDSN: "postgres://user:pass@localhost:5432/audit?sslmode=prefer", + } + + err := ValidateSaaSTLS(cfg) + assert.Error(t, err, "sslmode=prefer can fallback to cleartext, should NOT be considered TLS enabled") + assert.Contains(t, strings.ToLower(err.Error()), "postgresql") + }) + + t.Run("PostgreSQL sslmode=allow is NOT considered TLS enabled (can fallback to cleartext)", func(t *testing.T) { + os.Setenv("DEPLOYMENT_MODE", "saas") + cfg := TLSConfig{ + MongoURI: "mongodb+srv://user:pass@cluster.mongodb.net/flowker", + PostgresDSN: "postgres://user:pass@localhost:5432/audit?sslmode=allow", + } + + err := ValidateSaaSTLS(cfg) + assert.Error(t, err, "sslmode=allow can fallback to cleartext, should NOT be considered TLS enabled") + assert.Contains(t, strings.ToLower(err.Error()), "postgresql") + }) +} diff --git a/tests/e2e/main_test.go b/tests/e2e/main_test.go index 1087cec..cd995dd 100644 --- a/tests/e2e/main_test.go +++ b/tests/e2e/main_test.go @@ -109,7 +109,7 @@ func TestMain(m *testing.M) { go suite.service.Run() - if err := waitForServer(fmt.Sprintf("http://%s/health/live", serverAddr), 30*time.Second); err != nil { + if err := waitForServer(fmt.Sprintf("http://%s/health", serverAddr), 30*time.Second); err != nil { fmt.Printf("server not ready: %v\n", err) terminate(ctx) os.Exit(1) diff --git a/tests/integration/auth_test.go b/tests/integration/auth_test.go index b14c8a4..4d09b37 100644 --- a/tests/integration/auth_test.go +++ b/tests/integration/auth_test.go @@ -37,9 +37,8 @@ func TestAuth_PublicEndpoints_NoAuthRequired(t *testing.T) { name string path string }{ - {name: "liveness probe", path: "/health/live"}, - {name: "readiness probe", path: "/health/ready"}, - {name: "combined health", path: "/health"}, + {name: "liveness probe", path: "/health"}, + {name: "readiness probe", path: "/readyz"}, {name: "swagger docs", path: "/swagger/index.html"}, } diff --git a/tests/integration/health_test.go b/tests/integration/health_test.go index a87deee..01f7f7b 100644 --- a/tests/integration/health_test.go +++ b/tests/integration/health_test.go @@ -13,7 +13,9 @@ import ( func TestHealthEndpoints(t *testing.T) { client := httpClient() - urls := []string{"/health", "/health/live", "/health/ready"} + // /health is the liveness probe (gated by self-probe) + // /readyz is the readiness probe (checks all dependencies) + urls := []string{"/health", "/readyz"} for _, path := range urls { path := path // capture range variable diff --git a/tests/integration/main_test.go b/tests/integration/main_test.go index 49e681b..8870cf3 100644 --- a/tests/integration/main_test.go +++ b/tests/integration/main_test.go @@ -95,7 +95,7 @@ func TestMain(m *testing.M) { go suite.service.Run() - if err := waitForServer(fmt.Sprintf("http://%s/health/live", serverAddr), 30*time.Second); err != nil { + if err := waitForServer(fmt.Sprintf("http://%s/health", serverAddr), 30*time.Second); err != nil { fmt.Printf("server not ready: %v\n", err) terminate(ctx) os.Exit(1)