Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 79 additions & 34 deletions app/api/cron/security-behavioral-scan/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,60 +10,60 @@
* One log line per detected event so triage can pivot by user/key without
* having to re-run the query.
*
* Deployment: invoked by an external scheduler (Kubernetes CronJob)
* every 5 minutes. Authorized via `Authorization: Bearer $CRON_SECRET`,
* mirroring `app/api/cron/agentic-wallet-sweeper`. The endpoint fails
* closed when `CRON_SECRET` is unset -- there is no NODE_ENV dev/test
* bypass, so a prod container that boots with `NODE_ENV=test` (the
* misconfig the v2 review flagged) cannot accidentally open the
* endpoint. Local dev sets `CRON_SECRET` in `.env` to invoke via curl.
* Deployment: invoked by a Kubernetes CronJob every 5 minutes (the
* `security-behavioral-scan` job in `deploy/keeperhub/{prod,staging}/
* values.yaml`, which runs `deploy/scripts/reaper.sh` against this path).
* Authorized via `X-Service-Key: $SCHEDULER_SERVICE_API_KEY` through
* `authenticateInternalService`, the same mechanism the reaper CronJob
* uses -- so scheduling reuses the existing scheduler SSM key rather than
* provisioning a dedicated cron secret. The endpoint fails closed when no
* service key matches; there is no NODE_ENV dev/test bypass, so a prod
* container that boots with `NODE_ENV=test` (the misconfig the v2 review
* flagged) cannot accidentally open the endpoint. Local dev sets
* `SCHEDULER_SERVICE_API_KEY` in `.env` to invoke via curl.
*
* Detection windows are deliberately overlapping so a transient blip in
* scheduler timing doesn't drop an event. Idempotency comes from the
* downstream alert layer (Loki dedupe within the alert group window),
* not from this endpoint.
* scheduler timing doesn't drop an event: the CronJob fires every 5
* minutes but EXECUTION_LOOKBACK_MS is 10 minutes, so every execution is
* read by two consecutive scans and a single late/skipped run still
* leaves it covered. Idempotency comes from the downstream alert layer
* (the `new_account_first_workflow` Loki rule evaluates a 10-minute
* window and fires on >=1 occurrence, so the duplicate emissions collapse
* to a single page), not from this endpoint.
*/

import { captureMessage } from "@sentry/nextjs";
import { and, eq, gt, isNotNull } from "drizzle-orm";
import { db } from "@/lib/db";
import { users, workflowExecutions } from "@/lib/db/schema";
import { authenticateInternalService } from "@/lib/internal-service-auth";

export const dynamic = "force-dynamic";

const NEW_ACCOUNT_WINDOW_MS = 15 * 60 * 1000;
const EXECUTION_LOOKBACK_MS = 5 * 60 * 1000;
// Wider than the 5-minute CronJob interval so consecutive scans overlap and
// scheduler jitter cannot drop an execution from coverage. Matched to the
// 10-minute window of the downstream `new_account_first_workflow` Loki alert
// (relative_time_range from=600 in keeperhub-security-alerts.tf) so the
// resulting duplicate emissions dedupe to a single page.
const EXECUTION_LOOKBACK_MS = 10 * 60 * 1000;

/**
 * Summary of one behavioral scan run; the GET handler serializes it as the
 * JSON body of a successful response.
 */
type BehavioralScanResponse = {
/** Number of rows the new-account-first-workflow query matched in this run. */
newAccountFirstWorkflowEvents: number;
/** Milliseconds elapsed between the handler's start timestamp and the end of the scan. */
durationMs: number;
};

/**
 * Decides whether a request carries the cron bearer credential.
 *
 * The check fails closed: when `CRON_SECRET` is unset or empty every request
 * is rejected, and there is no NODE_ENV-based bypass. Local development is
 * expected to set `CRON_SECRET` in `.env` and call the endpoint via curl.
 *
 * @param request - Incoming request whose `authorization` header is inspected.
 * @returns `true` only when the header is exactly `Bearer <CRON_SECRET>`.
 */
function isAuthorized(request: Request): boolean {
  const secret = process.env.CRON_SECRET;

  // No configured secret means nothing can match -- reject outright.
  if (secret === undefined || secret === "") {
    return false;
  }

  const presented = request.headers.get("authorization");
  const required = `Bearer ${secret}`;
  return presented === required;
}

export async function GET(request: Request): Promise<Response> {
const startedAt = Date.now();

if (!isAuthorized(request)) {
return Response.json({ error: "unauthorized" }, { status: 401 });
}

async function scanNewAccountFirstWorkflow(
startedAt: number
): Promise<BehavioralScanResponse> {
const now = new Date();
const accountFloor = new Date(now.getTime() - NEW_ACCOUNT_WINDOW_MS);
const executionFloor = new Date(now.getTime() - EXECUTION_LOOKBACK_MS);

// New-account-first-workflow: any execution within the last 5 minutes
// owned by a user whose account is newer than the 15-minute floor. The
// join captures the user's signup age so the alert can carry it.
// New-account-first-workflow: any execution within the EXECUTION_LOOKBACK_MS
// window (10 minutes) owned by a user whose account is newer than the
// 15-minute floor. The join captures the user's signup age so the alert
// can carry it.
const rows = await db
.select({
userId: workflowExecutions.userId,
Expand Down Expand Up @@ -126,9 +126,54 @@ export async function GET(request: Request): Promise<Response> {
);
}

const body: BehavioralScanResponse = {
return {
newAccountFirstWorkflowEvents: rows.length,
durationMs: Date.now() - startedAt,
};
return Response.json(body);
}

/**
 * Cron entry point for the security behavioral scan.
 *
 * Authenticates the caller, runs the new-account-first-workflow scan, and
 * returns its summary as JSON.
 *
 * Responses:
 * - 401 `{ error: "unauthorized" }` unless `authenticateInternalService`
 *   authenticates the request AND resolves the caller to "scheduler".
 * - 200 with the scan summary when the scan completes.
 * - 500 `{ error: "scan_failed" }` when the scan throws; a
 *   `security.behavioral.scan_error` event is emitted first.
 *
 * @param request - Incoming request; expected to carry the scheduler's
 *   `X-Service-Key` header (sent by the CronJob).
 * @returns The JSON response described above.
 */
export async function GET(request: Request): Promise<Response> {
  const startedAt = Date.now();

  // Shared internal-service auth, narrowed to the scheduler caller. Other
  // service keys may authenticate successfully but are still rejected here:
  // only the scheduler has a reason to trigger a detection scan. There is no
  // NODE_ENV bypass, so with no key configured nothing matches.
  const auth = await authenticateInternalService(request);
  const isScheduler = auth.authenticated && auth.caller === "scheduler";
  if (!isScheduler) {
    return Response.json({ error: "unauthorized" }, { status: 401 });
  }

  try {
    return Response.json(await scanNewAccountFirstWorkflow(startedAt));
  } catch (error) {
    // A failed scan (e.g. a DB error) means detection silently drops to
    // zero, so the failure itself has to be observable. Report it through
    // both Sentry and a structured log line, then answer 500. The status
    // code alone is not a sufficient signal: reaper.sh reportedly runs curl
    // without -f, so a non-2xx still counts as a successful job.
    let message: string;
    if (error instanceof Error) {
      message = error.message;
    } else {
      message = String(error);
    }

    try {
      captureMessage("security.behavioral.scan_error", {
        level: "error",
        tags: { security: "behavioral.scan_error" },
        extra: { message, durationMs: Date.now() - startedAt },
      });
    } catch {
      // A broken Sentry call must not hide the original scan failure.
    }

    console.error(
      JSON.stringify({
        event: "security.behavioral.scan_error",
        message,
        durationMs: Date.now() - startedAt,
      })
    );

    return Response.json({ error: "scan_failed" }, { status: 500 });
  }
}
27 changes: 27 additions & 0 deletions deploy/keeperhub/prod/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -516,3 +516,30 @@ cronjob:
failedJobsHistoryLimit: 3
restartPolicy: OnFailure
backoffLimit: 2
- name: security-behavioral-scan
image:
repository: ${ECR_REGISTRY}/keeperhub-prod
tag: app-${IMAGE_TAG}
imagePullPolicy: Always
schedule: "*/5 * * * *" # every 5 minutes
command: ["/bin/sh"]
args:
- "/app/deploy/scripts/reaper.sh"
- "http://keeperhub-common:3000/api/cron/security-behavioral-scan"
env:
SCHEDULER_SERVICE_API_KEY:
type: parameterStore
name: scheduler-service-api-key
parameter_name: /eks/techops-prod/keeperhub-scheduler/keeperhub-api-key
resources:
requests:
memory: "32Mi"
cpu: "10m"
limits:
memory: "32Mi"
cpu: "10m"
concurrencyPolicy: Forbid
successfulJobsHistoryLimit: 3
failedJobsHistoryLimit: 3
restartPolicy: OnFailure
backoffLimit: 2
27 changes: 27 additions & 0 deletions deploy/keeperhub/staging/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -510,3 +510,30 @@ cronjob:
failedJobsHistoryLimit: 3
restartPolicy: OnFailure
backoffLimit: 2
- name: security-behavioral-scan
image:
repository: ${ECR_REGISTRY}/keeperhub-staging
tag: app-${IMAGE_TAG}
imagePullPolicy: Always
schedule: "*/5 * * * *" # every 5 minutes
command: ["/bin/sh"]
args:
- "/app/deploy/scripts/reaper.sh"
- "http://keeperhub-common:3000/api/cron/security-behavioral-scan"
env:
SCHEDULER_SERVICE_API_KEY:
type: parameterStore
name: scheduler-service-api-key
parameter_name: /eks/techops-staging/keeperhub-scheduler/keeperhub-api-key
resources:
requests:
memory: "32Mi"
cpu: "10m"
limits:
memory: "32Mi"
cpu: "10m"
concurrencyPolicy: Forbid
successfulJobsHistoryLimit: 3
failedJobsHistoryLimit: 3
restartPolicy: OnFailure
backoffLimit: 2
27 changes: 27 additions & 0 deletions deploy/pr-environment/values.template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -469,3 +469,30 @@ cronjob:
failedJobsHistoryLimit: 3
restartPolicy: OnFailure
backoffLimit: 2
- name: security-behavioral-scan
image:
repository: ${ECR_REGISTRY}/keeperhub-staging
tag: app-${IMAGE_TAG}
imagePullPolicy: Always
schedule: "* * * * *" # every minute (faster than prod for PR validation)
command: ["/bin/sh"]
args:
- "/app/deploy/scripts/reaper.sh"
- "http://keeperhub-pr-${PR_NUMBER}-common:3000/api/cron/security-behavioral-scan"
env:
SCHEDULER_SERVICE_API_KEY:
type: parameterStore
name: scheduler-service-api-key
parameter_name: /eks/techops-staging/keeperhub-scheduler/keeperhub-api-key
resources:
requests:
memory: "32Mi"
cpu: "10m"
limits:
memory: "32Mi"
cpu: "10m"
concurrencyPolicy: Forbid
successfulJobsHistoryLimit: 3
failedJobsHistoryLimit: 3
restartPolicy: OnFailure
backoffLimit: 2
Loading
Loading