diff --git a/.playground/nuxt.config.ts b/.playground/nuxt.config.ts index b4e412b2..7586e4b3 100644 --- a/.playground/nuxt.config.ts +++ b/.playground/nuxt.config.ts @@ -14,6 +14,13 @@ export default defineNuxtConfig({ */ defineNuxtModule({ setup(_, nuxt) { + nuxt.hooks.hook('robots:config', (config) => { + const catchAll = config.groups.find(g => g.userAgent.includes('*')) + if (catchAll) { + catchAll.disallow.push('/__link-checker__/') + } + console.log({ catchAll, groups: config.groups }) + }) if (!nuxt.options.dev) return diff --git a/.playground/pages/index.vue b/.playground/pages/index.vue index f44fe244..959c1fd5 100644 --- a/.playground/pages/index.vue +++ b/.playground/pages/index.vue @@ -1,9 +1,18 @@ + + diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..a6742e48 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,61 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Development Commands + +- **Build**: `pnpm build` - Builds the module using nuxt-module-build and generates client +- **Development**: `pnpm dev` - Runs playground at `.playground` directory +- **Development Preparation**: `pnpm dev:prepare` - Prepares development environment with stub build +- **Test**: `pnpm test` - Runs vitest test suite +- **Lint**: `pnpm lint` - Runs ESLint with auto-fix using @antfu/eslint-config +- **Type Check**: `pnpm typecheck` - Runs TypeScript compiler for type checking +- **Client Development**: `pnpm client:dev` - Runs devtools UI client on port 3300 +- **Release**: `pnpm release` - Builds, bumps version, and publishes + +## Architecture Overview + +This is a Nuxt module (`@nuxtjs/robots`) that provides robots.txt generation and robot meta tag functionality for Nuxt applications. 
+ +### Core Module Structure + +- **`src/module.ts`**: Main module entry point with module options and setup logic +- **`src/runtime/`**: Runtime code that gets injected into user applications + - **`app/`**: Client-side runtime (composables, plugins) + - **`server/`**: Server-side runtime (middleware, routes, composables) +- **`src/kit.ts`**: Utilities for build-time module functionality +- **`src/util.ts`**: Shared utilities exported to end users + +### Key Runtime Components + +- **Server Routes**: + - `/robots.txt` route handler in `src/runtime/server/routes/robots-txt.ts` + - Debug routes under `/__robots__/` for development +- **Server Composables**: `getSiteRobotConfig()` and `getPathRobotConfig()` for runtime robot configuration +- **Client Composables**: `useRobotsRule()` for accessing robot rules in Vue components +- **Meta Plugin**: Automatically injects robot meta tags and X-Robots-Tag headers + +### Build System + +- Uses `@nuxt/module-builder` with unbuild configuration in `build.config.ts` +- Exports multiple entry points: main module, `/util`, and `/content` +- Supports both ESM and CommonJS via rollup configuration + +### Test Structure + +- **Integration Tests**: Test fixtures in `test/fixtures/` with full Nuxt apps +- **Unit Tests**: Focused tests in `test/unit/` for specific functionality +- Uses `@nuxt/test-utils` for testing Nuxt applications +- Test environment automatically set to production mode + +### Development Workflow + +The module supports a playground at `.playground` for local development and manual testing. The client UI (devtools integration) is developed separately in the `client/` directory. + +### I18n Integration + +The module has special handling for i18n scenarios, with logic in `src/i18n.ts` for splitting paths and handling localized routes. + +### Content Integration + +Provides integration with Nuxt Content module via `src/content.ts` for content-based robot configurations. 
+ diff --git a/docs/content/3.api/1.config.md b/docs/content/3.api/0.config.md similarity index 100% rename from docs/content/3.api/1.config.md rename to docs/content/3.api/0.config.md diff --git a/docs/content/3.api/1.nuxt-hooks.md b/docs/content/3.api/robots-config.md similarity index 89% rename from docs/content/3.api/1.nuxt-hooks.md rename to docs/content/3.api/robots-config.md index 33313932..59c87195 100644 --- a/docs/content/3.api/1.nuxt-hooks.md +++ b/docs/content/3.api/robots-config.md @@ -1,10 +1,8 @@ --- -title: Nuxt Hooks +title: "Hook: robots:config" description: Learn how to use Nuxt hooks to modify the robots config. --- -## `'robots:config'`{lang="ts"} - **Type:** `(config: ResolvedModuleOptions) => void | Promise<void>`{lang="ts"} This hook allows you to modify the robots config before it is used to generate the robots.txt and meta tags. diff --git a/libs/is-bot/.gitignore b/libs/is-bot/.gitignore new file mode 100644 index 00000000..99afced5 --- /dev/null +++ b/libs/is-bot/.gitignore @@ -0,0 +1,8 @@ +node_modules/ +dist/ +*.log +.DS_Store +coverage/ +.nyc_output/ +*.tgz +*.tar.gz \ No newline at end of file diff --git a/libs/is-bot/README.md b/libs/is-bot/README.md new file mode 100644 index 00000000..a8d2a32a --- /dev/null +++ b/libs/is-bot/README.md @@ -0,0 +1,162 @@ +# Bot Detection Library + +A framework-agnostic bot detection library with advanced behavioral analysis capabilities. 
+ +## Features + +- 🤖 **Advanced Bot Detection**: Multi-layered analysis including user agents, behavioral patterns, and timing analysis +- 🔧 **Framework Agnostic**: Works with any web framework through driver pattern +- 🚀 **H3/Nuxt Ready**: Built-in support for H3 events and Nuxt applications +- 📊 **Behavioral Analysis**: Modular system with simple, intermediate, and advanced detection behaviors +- 💾 **Flexible Storage**: Supports multiple storage backends through adapter pattern +- 🎯 **High Performance**: Optimized with batch operations and intelligent caching +- 🛡️ **Security Focused**: IP allowlists/blocklists, rate limiting, and threat detection + +## Installation + +```bash +npm install @nuxtjs/robots-bot-detection +``` + +## Quick Start + +### Basic Usage + +```typescript +import { BotDetectionEngine, MemoryAdapter, H3SessionIdentifier } from '@nuxtjs/robots-bot-detection' + +// Create storage adapter +const storage = new MemoryAdapter() + +// Create session identifier +const sessionIdentifier = new H3SessionIdentifier() + +// Create engine +const engine = new BotDetectionEngine({ + storage, + sessionIdentifier, + config: { + thresholds: { + likelyBot: 70, + definitelyBot: 90 + } + } +}) + +// Analyze a request +const request = { + path: '/api/data', + method: 'GET', + headers: { + 'user-agent': 'Mozilla/5.0 ...' 
+ }, + ip: '192.168.1.1', + timestamp: Date.now() +} + +const result = await engine.analyze(request) +console.log(`Bot score: ${result.score}`) +console.log(`Is bot: ${result.isBot}`) +``` + +### H3/Nuxt Integration + +```typescript +import { BotDetectionEngine, UnstorageBehaviorAdapter, H3SessionIdentifier, h3ToBotDetectionRequest } from '@nuxtjs/robots-bot-detection' +import { useStorage } from 'unstorage' + +const storage = useStorage('redis://localhost:6379') +const adapter = new UnstorageBehaviorAdapter(storage) +const sessionIdentifier = new H3SessionIdentifier('your-session-secret') + +const engine = new BotDetectionEngine({ + storage: adapter, + sessionIdentifier +}) + +// In your H3 handler +export default defineEventHandler(async (event) => { + const result = await engine.analyze(h3ToBotDetectionRequest(event), event) + + if (result.isBot) { + throw createError({ + statusCode: 429, + statusMessage: 'Too Many Requests' + }) + } + + // Continue with normal processing +}) +``` + +## API Reference + +### BotDetectionEngine + +The main engine class for bot detection. + +#### Constructor Options + +```typescript +interface BotDetectionEngineOptions { + storage: BehaviorStorage + sessionIdentifier: SessionIdentifier + responseStatusProvider?: ResponseStatusProvider + config?: BotDetectionConfig +} +``` + +#### Methods + +- `analyze(request: BotDetectionRequest, event?: H3Event): Promise` +- `updateConfig(config: Partial): void` +- `cleanup(): Promise` + +### Storage Adapters + +#### MemoryAdapter +In-memory storage for development and testing. + +#### UnstorageBehaviorAdapter +Production-ready storage adapter using unstorage. 
+ +### Behavior Configuration + +Configure which detection behaviors to enable: + +```typescript +const config = { + behaviors: { + simple: { + pathAnalysis: { enabled: true, weight: 1.0 }, + basicTiming: { enabled: true, weight: 0.8 }, + basicRateLimit: { enabled: true, weight: 1.2 } + }, + intermediate: { + burstDetection: { enabled: true, weight: 1.0 }, + headerConsistency: { enabled: true, weight: 0.9 } + }, + advanced: { + advancedTiming: { enabled: false, weight: 1.5 }, + browserFingerprint: { enabled: false, weight: 1.3 } + } + } +} +``` + +## Testing + +```bash +# Run tests once +npm run test:run + +# Run tests with coverage +npm run test:coverage + +# Run tests in watch mode +npm test +``` + +## License + +MIT License - see LICENSE file for details. \ No newline at end of file diff --git a/libs/is-bot/package.json b/libs/is-bot/package.json new file mode 100644 index 00000000..237d69e5 --- /dev/null +++ b/libs/is-bot/package.json @@ -0,0 +1,70 @@ +{ + "name": "@nuxtjs/robots-bot-detection", + "version": "1.0.0", + "description": "Framework-agnostic bot detection library", + "type": "module", + "main": "./dist/index.js", + "module": "./dist/index.js", + "types": "./dist/index.d.ts", + "exports": { + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js", + "require": "./dist/index.cjs" + }, + "./h3": { + "types": "./dist/drivers/h3.d.ts", + "import": "./dist/drivers/h3.js", + "require": "./dist/drivers/h3.cjs" + }, + "./behaviors": { + "types": "./dist/behaviors/index.d.ts", + "import": "./dist/behaviors/index.js", + "require": "./dist/behaviors/index.cjs" + } + }, + "files": [ + "dist", + "src" + ], + "scripts": { + "build": "tsup", + "dev": "tsup --watch", + "test": "vitest", + "test:run": "vitest run", + "test:coverage": "vitest run --coverage", + "typecheck": "tsc --noEmit", + "lint": "eslint src test --ext .ts,.js", + "lint:fix": "eslint src test --ext .ts,.js --fix" + }, + "keywords": [ + "bot-detection", + "security", + "web-scraping", + 
"rate-limiting", + "h3", + "nuxt", + "nitro" + ], + "author": "Nuxt Team", + "license": "MIT", + "dependencies": { + "unstorage": "^1.16.0" + }, + "peerDependencies": { + "h3": "^1.0.0" + }, + "devDependencies": { + "@types/node": "^20.19.4", + "eslint": "^9.30.1", + "h3": "^1.15.3", + "tsup": "^8.5.0", + "typescript": "^5.8.3", + "vitest": "^3.2.4" + }, + "repository": { + "type": "git", + "url": "https://github.com/nuxt-modules/robots.git", + "directory": "libs/is-bot" + } +} diff --git a/libs/is-bot/src/adapters/behavior-storage.ts b/libs/is-bot/src/adapters/behavior-storage.ts new file mode 100644 index 00000000..38c73817 --- /dev/null +++ b/libs/is-bot/src/adapters/behavior-storage.ts @@ -0,0 +1,113 @@ +// Adapter to bridge between framework-agnostic storage and behavior types +import type { Storage } from 'unstorage' +import type { SessionData, IPData } from '../behavior' +import type { SiteProfile } from '../types' + +export interface BehaviorStorage { + getSession(sessionId: string): Promise + setSession(sessionId: string, data: SessionData): Promise + getIP(ip: string): Promise + setIP(ip: string, data: IPData): Promise + getSiteProfile(): Promise + setSiteProfile(profile: SiteProfile): Promise + cleanup?(): Promise +} + +export class UnstorageBehaviorAdapter implements BehaviorStorage { + private storage: Storage + private prefix: string + private sessionTTL: number + private ipTTL: number + private siteProfileTTL: number + + constructor(storage: Storage, options: { + prefix?: string, + sessionTTL?: number, + ipTTL?: number, + siteProfileTTL?: number + } = {}) { + this.storage = storage + this.prefix = options.prefix || 'bot-detection' + this.sessionTTL = options.sessionTTL || 24 * 60 * 60 * 1000 // 24 hours default + this.ipTTL = options.ipTTL || 7 * 24 * 60 * 60 * 1000 // 7 days default + this.siteProfileTTL = options.siteProfileTTL || 30 * 24 * 60 * 60 * 1000 // 30 days default + } + + async getSession(sessionId: string): Promise { + const key = 
`${this.prefix}:session:${sessionId}` + const data = await this.storage.getItem(key) + + if (!data) return null + + // Check TTL + if (Date.now() - data.lastUpdated > this.sessionTTL) { + await this.storage.removeItem(key) + return null + } + + return data + } + + async setSession(sessionId: string, data: SessionData): Promise { + const key = `${this.prefix}:session:${sessionId}` + await this.storage.setItem(key, data) + } + + async getIP(ip: string): Promise { + const key = `${this.prefix}:ip:${this.sanitizeIP(ip)}` + const data = await this.storage.getItem(key) + + if (!data) return null + + // Check TTL + if (Date.now() - data.lastUpdated > this.ipTTL) { + await this.storage.removeItem(key) + return null + } + + return data + } + + async setIP(ip: string, data: IPData): Promise { + const key = `${this.prefix}:ip:${this.sanitizeIP(ip)}` + await this.storage.setItem(key, data) + } + + async getSiteProfile(): Promise { + const key = `${this.prefix}:site-profile` + return await this.storage.getItem(key) + } + + async setSiteProfile(profile: SiteProfile): Promise { + const key = `${this.prefix}:site-profile` + await this.storage.setItem(key, profile) + } + + async cleanup(): Promise { + const keys = await this.storage.getKeys(`${this.prefix}:`) + const now = Date.now() + + // Clean up expired sessions + const sessionKeys = keys.filter(key => key.includes(':session:')) + for (const key of sessionKeys) { + const data = await this.storage.getItem(key) + if (data && (now - data.lastUpdated > this.sessionTTL)) { + await this.storage.removeItem(key) + } + } + + // Clean up expired IP data + const ipKeys = keys.filter(key => key.includes(':ip:')) + for (const key of ipKeys) { + const data = await this.storage.getItem(key) + if (data && (now - data.lastUpdated > this.ipTTL)) { + await this.storage.removeItem(key) + } + } + } + + private sanitizeIP(ip: string): string { + // Replace : and . 
with - for safe key names + return ip.replace(/[:.]/g, '-') + } +} \ No newline at end of file diff --git a/libs/is-bot/src/adapters/h3.ts b/libs/is-bot/src/adapters/h3.ts new file mode 100644 index 00000000..05fbd35b --- /dev/null +++ b/libs/is-bot/src/adapters/h3.ts @@ -0,0 +1,131 @@ +// H3/Nuxt adapters for bot detection +import type { H3Event } from 'h3' +import { getHeaders, getRequestIP, getResponseStatus, useSession } from 'h3' +import type { + BotDetectionRequest, + SessionIdentifier, + ResponseStatusProvider +} from '../types' + +/** + * Convert H3Event to BotDetectionRequest + */ +export function h3ToBotDetectionRequest(event: H3Event): BotDetectionRequest { + const headers = getHeaders(event) + const ip = getRequestIP(event, { xForwardedFor: true }) || '127.0.0.1' + + return { + path: event.path || '/', + method: event.method || 'GET', + headers: headers as Record, + ip, + timestamp: Date.now() + } +} + +/** + * H3 Session Identifier using useSession + */ +export class H3SessionIdentifier implements SessionIdentifier { + private sessionPassword: string + + constructor(sessionPassword?: string) { + this.sessionPassword = sessionPassword || 'default-bot-detection-password' + } + + async getSessionId(request: BotDetectionRequest): Promise { + // This is a simplified version - in practice you'd need to pass the H3Event + // For now, we'll generate a session ID based on IP and user agent + const userAgent = Array.isArray(request.headers['user-agent']) + ? 
request.headers['user-agent'][0] + : request.headers['user-agent'] || '' + + // Simple hash for demo - in practice use proper session handling + const sessionKey = `${request.ip}-${this.simpleHash(userAgent)}` + return sessionKey + } + + private simpleHash(str: string): string { + let hash = 0 + for (let i = 0; i < str.length; i++) { + const char = str.charCodeAt(i) + hash = ((hash << 5) - hash) + char + hash = hash & hash // Convert to 32-bit integer + } + return Math.abs(hash).toString(36) + } +} + +/** + * H3 Session Identifier using actual H3 sessions + */ +export class H3RealSessionIdentifier implements SessionIdentifier { + private sessionPassword: string + + constructor(sessionPassword?: string) { + this.sessionPassword = sessionPassword || 'default-bot-detection-password' + } + + // Note: This requires the actual H3Event, so you'd need to modify the interface + async getSessionIdFromEvent(event: H3Event): Promise { + const session = await useSession(event, { + password: this.sessionPassword + }) + return session.id + } + + // Fallback implementation for the interface + getSessionId(request: BotDetectionRequest): string { + // Generate deterministic session ID from request data + const userAgent = Array.isArray(request.headers['user-agent']) + ? 
request.headers['user-agent'][0] + : request.headers['user-agent'] || '' + + return `${request.ip}-${this.simpleHash(userAgent)}` + } + + private simpleHash(str: string): string { + let hash = 0 + for (let i = 0; i < str.length; i++) { + const char = str.charCodeAt(i) + hash = ((hash << 5) - hash) + char + hash = hash & hash + } + return Math.abs(hash).toString(36) + } +} + +/** + * H3 Response Status Provider + */ +export class H3ResponseStatusProvider implements ResponseStatusProvider { + private eventMap = new WeakMap() + + // Register the H3Event for a request so we can get status later + registerEvent(request: BotDetectionRequest, event: H3Event) { + this.eventMap.set(request, event) + } + + getStatus(request: BotDetectionRequest): number | undefined { + const event = this.eventMap.get(request) + if (!event) return undefined + + try { + return getResponseStatus(event) + } catch { + return undefined + } + } +} + +/** + * Utility to create a BotDetectionRequest that maintains reference to H3Event + */ +export function createTrackedBotDetectionRequest( + event: H3Event, + statusProvider: H3ResponseStatusProvider +): BotDetectionRequest { + const request = h3ToBotDetectionRequest(event) + statusProvider.registerEvent(request, event) + return request +} \ No newline at end of file diff --git a/libs/is-bot/src/adapters/memory.ts b/libs/is-bot/src/adapters/memory.ts new file mode 100644 index 00000000..ca965f96 --- /dev/null +++ b/libs/is-bot/src/adapters/memory.ts @@ -0,0 +1,90 @@ +// In-memory storage adapter for bot detection (for testing/development) +import type { BotDetectionStorage, SessionData, IPData, SiteProfile } from '../types' + +export class MemoryAdapter implements BotDetectionStorage { + private sessions = new Map() + private ips = new Map() + private siteProfile: SiteProfile | null = null + private ttl: number + + constructor(options: { ttl?: number } = {}) { + this.ttl = options.ttl || 24 * 60 * 60 * 1000 // 24 hours default + } + + async 
getSession(sessionId: string): Promise { + const data = this.sessions.get(sessionId) + + if (!data) return null + + // Check TTL + if (Date.now() - data.lastUpdated > this.ttl) { + this.sessions.delete(sessionId) + return null + } + + return data + } + + async setSession(sessionId: string, data: SessionData): Promise { + this.sessions.set(sessionId, data) + } + + async getIP(ip: string): Promise { + const data = this.ips.get(ip) + + if (!data) return null + + // Check TTL + if (Date.now() - data.lastUpdated > this.ttl) { + this.ips.delete(ip) + return null + } + + return data + } + + async setIP(ip: string, data: IPData): Promise { + this.ips.set(ip, data) + } + + async getSiteProfile(): Promise { + return this.siteProfile + } + + async setSiteProfile(profile: SiteProfile): Promise { + this.siteProfile = profile + } + + async cleanup(): Promise { + const now = Date.now() + + // Clean up expired sessions + for (const [sessionId, data] of this.sessions.entries()) { + if (now - data.lastUpdated > this.ttl) { + this.sessions.delete(sessionId) + } + } + + // Clean up expired IP data + for (const [ip, data] of this.ips.entries()) { + if (now - data.lastUpdated > this.ttl) { + this.ips.delete(ip) + } + } + } + + // Development helpers + getStats() { + return { + sessions: this.sessions.size, + ips: this.ips.size, + hasSiteProfile: !!this.siteProfile + } + } + + clear() { + this.sessions.clear() + this.ips.clear() + this.siteProfile = null + } +} \ No newline at end of file diff --git a/libs/is-bot/src/adapters/unstorage.ts b/libs/is-bot/src/adapters/unstorage.ts new file mode 100644 index 00000000..83707ed0 --- /dev/null +++ b/libs/is-bot/src/adapters/unstorage.ts @@ -0,0 +1,134 @@ +// Unstorage adapter for bot detection storage +import type { Storage } from 'unstorage' +import type { BotDetectionStorage, SessionData, IPData, SiteProfile } from '../types' + +export class UnstorageAdapter implements BotDetectionStorage { + private storage: Storage + private prefix: string 
+ private sessionTTL: number + private ipTTL: number + private siteProfileTTL: number + + constructor(storage: Storage, options: { + prefix?: string, + ttl?: number, + sessionTTL?: number, + ipTTL?: number, + siteProfileTTL?: number + } = {}) { + this.storage = storage + this.prefix = options.prefix || 'bot-detection' + this.sessionTTL = options.sessionTTL || options.ttl || 24 * 60 * 60 * 1000 // 24 hours default + this.ipTTL = options.ipTTL || options.ttl || 7 * 24 * 60 * 60 * 1000 // 7 days default + this.siteProfileTTL = options.siteProfileTTL || options.ttl || 30 * 24 * 60 * 60 * 1000 // 30 days default + } + + async getSession(sessionId: string): Promise { + const key = `${this.prefix}:session:${sessionId}` + const data = await this.storage.getItem(key) + + if (!data) return null + + // Check TTL + if (Date.now() - data.lastUpdated > this.sessionTTL) { + await this.storage.removeItem(key) + return null + } + + return data + } + + async setSession(sessionId: string, data: SessionData): Promise { + const key = `${this.prefix}:session:${sessionId}` + await this.storage.setItem(key, data) + } + + async getIP(ip: string): Promise { + const key = `${this.prefix}:ip:${this.sanitizeIP(ip)}` + const data = await this.storage.getItem(key) + + if (!data) return null + + // Check TTL and cleanup old sessions + const now = Date.now() + if (now - data.lastUpdated > this.ipTTL) { + // Clean up old sessions + data.activeSessions = data.activeSessions.filter(sessionId => { + // This is a simplification - in practice you'd check session TTL + return true + }) + + if (data.activeSessions.length === 0) { + await this.storage.removeItem(key) + return null + } + } + + return data + } + + async setIP(ip: string, data: IPData): Promise { + const key = `${this.prefix}:ip:${this.sanitizeIP(ip)}` + await this.storage.setItem(key, data) + } + + async getSiteProfile(): Promise { + const key = `${this.prefix}:site-profile` + const data = await this.storage.getItem(key) + + if (!data) 
return null + + // Reconstruct Set and Map objects + if (data.existingPaths && Array.isArray(data.existingPaths)) { + data.existingPaths = new Set(data.existingPaths as any) + } + + if (data.userAgentPatterns && typeof data.userAgentPatterns === 'object') { + data.userAgentPatterns = new Map(Object.entries(data.userAgentPatterns as any)) + } + + return data + } + + async setSiteProfile(profile: SiteProfile): Promise { + const key = `${this.prefix}:site-profile` + + // Serialize Set and Map objects for storage + const serializable = { + ...profile, + existingPaths: Array.from(profile.existingPaths), + userAgentPatterns: Object.fromEntries(profile.userAgentPatterns) + } + + await this.storage.setItem(key, serializable) + } + + async cleanup(): Promise { + // Get all keys with our prefix + const keys = await this.storage.getKeys(`${this.prefix}:`) + const now = Date.now() + + // Clean up expired sessions + const sessionKeys = keys.filter(key => key.includes(':session:')) + for (const key of sessionKeys) { + const data = await this.storage.getItem(key) + if (data && (now - data.lastUpdated > this.sessionTTL)) { + await this.storage.removeItem(key) + } + } + + // Clean up expired IP data + const ipKeys = keys.filter(key => key.includes(':ip:')) + for (const key of ipKeys) { + const data = await this.storage.getItem(key) + if (data && (now - data.lastUpdated > this.ipTTL)) { + await this.storage.removeItem(key) + } + } + } + + private sanitizeIP(ip: string): string { + // Replace : and . 
with - for safe key names + return ip.replace(/[:.]/g, '-') + } +} \ No newline at end of file diff --git a/libs/is-bot/src/behavior.ts b/libs/is-bot/src/behavior.ts new file mode 100644 index 00000000..54a4ee71 --- /dev/null +++ b/libs/is-bot/src/behavior.ts @@ -0,0 +1,843 @@ +// Bot detection behavior analysis - framework agnostic + +// Common sensitive paths that bots target - expanded with more patterns +export const SENSITIVE_PATHS = [ + '/wp-login', + '/xmlrpc.php', + '/.env', + '/phpmyadmin', + '/setup', + '/install', + '/config', + '/.git', + '/.svn', + '/api/graphql', + '/graphql', + // Additional common bot targets + '/wp-content', + '/wp-includes', + '/wp-json', + '/.well-known/security.txt', + '/vendor/', + '/server-status', + '/solr/', + '/jenkins/', + '/.DS_Store', + '/actuator/', + '/console/', + '/wp-admin', + '/admin-login.php', + '/wp-login-hidden', +] + +// Honeypot/high-sensitivity paths - these could be legitimate in some cases +// but are frequently targeted by bots and rarely used by regular users +export const MAYBE_SENSITIVE_PATHS = [ + '/admin', + '/login', + '/administrator', + '/includes/config', + '/.hidden-login', + '/robots.txt.bak', + '/administrator/index.php', + '/myadmin', + '/admin_area', + '/panel', + '/cpanel', + '/dashboard', +] + +// Enhanced bot detection score thresholds with an intermediate level +export const BOT_SCORE_THRESHOLDS = { + DEFINITELY_BOT: 90, + LIKELY_BOT: 70, + SUSPICIOUS: 40, + PROBABLY_HUMAN: 20, + DEFINITELY_HUMAN: 5, +} + +// Configuration setter functions for external framework integration +export function setBotScoreThresholds(thresholds: { + definitelyBot?: number + likelyBot?: number + suspicious?: number +}) { + if (thresholds.definitelyBot !== undefined) { + BOT_SCORE_THRESHOLDS.DEFINITELY_BOT = thresholds.definitelyBot + } + if (thresholds.likelyBot !== undefined) { + BOT_SCORE_THRESHOLDS.LIKELY_BOT = thresholds.likelyBot + } + if (thresholds.suspicious !== undefined) { + 
BOT_SCORE_THRESHOLDS.SUSPICIOUS = thresholds.suspicious + } +} + +export function addCustomSensitivePaths(paths: string[]) { + SENSITIVE_PATHS.push(...paths) +} + +// Updated behavior weights with increased penalties for timing issues +export const BEHAVIOR_WEIGHTS = { + SENSITIVE_PATH: 15, // Accessing known sensitive paths + MAYBE_SENSITIVE_PATH: 5, // Accessing potentially sensitive paths (honeypot/admin areas) + RAPID_REQUESTS: 20, // Too many requests in short time + REPEATED_ERRORS: 15, // Repeated 404s or errors + UNUSUAL_PATTERN: 25, // Unusual access pattern + NONEXISTENT_RESOURCES: 10, // Requesting resources that don't exist + REQUEST_CONSISTENCY: 20, // Consistency in request patterns + MULTIPLE_SENSITIVE_HITS: 40, // Multiple hits to different sensitive paths + RESOURCE_TIMING: 25, // Abnormal timing between resource requests (increased from 15) + SESSION_ANOMALY: 30, // Suspicious session behavior +} + +// Traffic classification - helps distinguish between different user types +export enum TrafficType { + REGULAR_USER = 'regular_user', + SUSPICIOUS = 'suspicious_bot', + MALICIOUS_BOT = 'malicious_bot', + UNKNOWN = 'unknown', +} + +// Enhanced session data with more behavioral indicators +export interface SessionData { + lastRequests: Array<{ + timestamp: number + path: string + status?: number + timeSincePrevious?: number + method?: string + }> + suspiciousPathHits: number + maybeSensitivePathHits: number + uniqueSensitivePathsAccessed: string[] // Track unique sensitive paths accessed + errorCount: number + score: number + lastScore: number + lastUpdated: number + trafficType: TrafficType + knownGoodActions: number // Count of actions that indicate human behavior + tempExemptUntil?: number // Timestamp for temporary exemption + requestMethodVariety: string[] // Array of used HTTP methods + averageTimeBetweenRequests?: number + requestSequenceEntropy: number // Measure of randomness in request sequence + firstSeenAt: number // When the session was 
first created + behaviorChangePoints?: number[] // Timestamps where behavior significantly changed +} + +export interface IPData { + sessionCount: number + activeSessions: string[] // Track active session IDs + suspiciousScore: number + lastUpdated: number + legitSessionsCount: number // Count of sessions that passed human verification + sessionsPerHour?: number // Rate of new session creation + lastSessionCreated?: number // Timestamp of the last session created + isBot?: boolean + isBotConfidence?: number + details?: { name: string, type: string, trusted?: boolean } | null + factores: string[] // List of factors that contributed to the score (NOTE(review): name looks like a misspelling of `factors`; left unchanged because renaming alters the interface) +} + +// Helper to check if path is in the maybe-sensitive category (substring match against MAYBE_SENSITIVE_PATHS) +export function isMaybeSensitivePath(path: string): boolean { + return MAYBE_SENSITIVE_PATHS.some(sp => path.includes(sp)) +} + +// Calculate entropy of request sequences to detect non-human patterns (Shannon entropy in bits over path frequencies; 0 when fewer than 3 paths) +function calculateRequestEntropy(paths: string[]): number { + if (paths.length < 3) + return 0 + + // Count occurrences of each path + const pathCounts: Record = {} + for (const path of paths) { + pathCounts[path] = (pathCounts[path] || 0) + 1 + } + + // Calculate entropy + let entropy = 0 + const totalPaths = paths.length + + for (const path in pathCounts) { + const probability = pathCounts[path] / totalPaths + entropy -= probability * Math.log2(probability) + } + + return entropy +} + +// Detect if a request sequence matches natural browsing patterns (sequences shorter than 5 requests are reported as natural) +function analyzeRequestSequence(requests: Array<{ path: string, timestamp: number }>): { + isNaturalBrowsing: boolean + entropy: number + timeConsistency: number +} { + if (requests.length < 5) { + return { isNaturalBrowsing: true, entropy: 0, timeConsistency: 1 } + } + + // 1. 
Check time intervals between requests + const intervals: number[] = [] + + // Sort requests by timestamp (oldest to newest) for interval calculation + const sortedRequests = [...requests].sort((a, b) => a.timestamp - b.timestamp) + + for (let i = 1; i < sortedRequests.length; i++) { + intervals.push(sortedRequests[i].timestamp - sortedRequests[i - 1].timestamp) + } + + // Calculate variance in intervals + const avgInterval = intervals.reduce((sum, val) => sum + val, 0) / intervals.length + const variance = intervals.reduce((sum, val) => sum + (val - avgInterval) ** 2, 0) / intervals.length + + // Natural browsing has some variance in timing; when every request shares one timestamp avgInterval is 0, so report 0 instead of 0/0 = NaN + const timeConsistency = avgInterval > 0 ? Math.min(1, variance / (avgInterval * avgInterval)) : 0 + + // 2. Calculate path entropy + const paths = requests.map(r => r.path) + const entropy = calculateRequestEntropy(paths) + + // 3. Multiple indicators of unnatural browsing patterns + let suspiciousPatternCount = 0 + + // A. Check for alphabetical or sequential scanning patterns + const pathsInOrder = sortedRequests.map(r => r.path) + let sequentialOrdering = true + + for (let i = 1; i < pathsInOrder.length; i++) { + // If paths don't progress in a way that suggests scanning, it's more natural + if (pathsInOrder[i].localeCompare(pathsInOrder[i - 1]) < 0) { + sequentialOrdering = false + break + } + } + + if (sequentialOrdering && pathsInOrder.length >= 4) { + suspiciousPatternCount++ + } + + // B. Check for common prefix patterns (like crawling similar endpoints) + const pathPrefixes = new Map() + for (const path of paths) { + // Extract the first segment of the path (e.g., "/admin" from "/admin/users") + const prefix = `/${path.split('/')[1]}` + pathPrefixes.set(prefix, (pathPrefixes.get(prefix) || 0) + 1) + } + + // If 80% of requests are to the same prefix, it's suspicious + for (const [, count] of pathPrefixes.entries()) { + if (count >= Math.ceil(paths.length * 0.8) && paths.length >= 4) { + suspiciousPatternCount++ + break + } + } + + // C. 
Look for numeric incrementation patterns in paths (like id scanning) + const hasNumericPattern = paths.some((path) => { + const matches = path.match(/(\d+)/g) + return matches && matches.length > 0 + }) + + if (hasNumericPattern) { + let numericSequential = true + const numericValues: number[] = [] + + // Extract numeric values from paths + for (const path of pathsInOrder) { + const matches = path.match(/(\d+)/g) + if (matches && matches.length > 0) { + numericValues.push(Number.parseInt(matches[0], 10)) + } + else { + numericSequential = false + break + } + } + + // Check if numeric values are sequential or have a pattern + if (numericSequential && numericValues.length >= 3) { + let hasPattern = true + const diff = numericValues[1] - numericValues[0] + + for (let i = 2; i < numericValues.length; i++) { + if (numericValues[i] - numericValues[i - 1] !== diff) { + hasPattern = false + break + } + } + + if (hasPattern) { + suspiciousPatternCount++ + } + } + } + + // D. Check for consistent path length/structure (indicative of automation) + const pathLengths = paths.map(p => p.length) + const avgLength = pathLengths.reduce((sum, len) => sum + len, 0) / pathLengths.length + const lengthVariance = pathLengths.reduce((sum, len) => sum + (len - avgLength) ** 2, 0) / pathLengths.length + + // If path lengths are very consistent, it's suspicious + if (Math.sqrt(lengthVariance) / avgLength < 0.1 && paths.length >= 4) { + suspiciousPatternCount++ + } + + // Combine indicators to determine if this is natural browsing + const isNaturalBrowsing = ( + // Either high entropy (varied paths) and some time variance + (entropy > 1.5 && timeConsistency > 0.2) + // Or no suspicious patterns detected + || suspiciousPatternCount === 0 + ) + + return { + isNaturalBrowsing, + entropy, + timeConsistency, + } +} + +// Helper function to identify adaptive rate limits based on client history +function calculateRateLimit(sessionData: SessionData): number { + // Base rate limit - start with a 
moderate default + let rateLimit = 15 // requests per minute + + // Adjust based on client behavior + if (sessionData.trafficType === TrafficType.REGULAR_USER) { + // Regular users can have higher bursts during normal browsing + rateLimit = 30 + } + else if (sessionData.suspiciousPathHits > 0) { + // Clients that hit suspicious paths get stricter limits + rateLimit = Math.max(5, rateLimit - sessionData.suspiciousPathHits * 2) + } + + // Adjust for known good behavior + if (sessionData.knownGoodActions > 5) { + // Clients with good history get more flexibility + rateLimit += Math.min(20, sessionData.knownGoodActions) + } + + return rateLimit +} + +// Detect potential session hijacking or cookie theft +function detectSessionAnomaly(ipData: IPData, sessionData: SessionData, timestamp: number = Date.now()): { + suspicious: boolean + severity: number + reason?: string +} { + const result = { + suspicious: false, + severity: 0, + reason: '', + } + + const SESSION_AGE_THRESHOLD = 24 * 60 * 60 * 1000 // 24 hours in milliseconds + const now = timestamp + + // Check for IP with many sessions + if (ipData.activeSessions.length > 10) { + result.suspicious = true + result.severity = Math.min(70, ipData.activeSessions.length * 5) + result.reason = 'MANY_SESSIONS' + return result + } + + // Check for high session creation rate + if (ipData.sessionsPerHour && ipData.sessionsPerHour > 5) { + result.suspicious = true + result.severity = Math.min(60, ipData.sessionsPerHour * 10) + result.reason = 'RAPID_SESSION_CREATION' + return result + } + + // Check for abrupt behavior changes in old sessions + if (sessionData.firstSeenAt && (now - sessionData.firstSeenAt > SESSION_AGE_THRESHOLD)) { + // Old session with sudden suspicious activity + if (sessionData.suspiciousPathHits > 0 && sessionData.lastRequests.length > 5) { + // Check if suspicious behavior started recently + const recentRequests = sessionData.lastRequests.slice(-5) + const olderRequests = sessionData.lastRequests.slice(0, 
-5) + + // Calculate scores for both segments + const recentSuspicious = recentRequests.filter(r => + SENSITIVE_PATHS.some(sp => r.path.includes(sp)) + || MAYBE_SENSITIVE_PATHS.some(sp => r.path.includes(sp)), + ).length + + const olderSuspicious = olderRequests.filter(r => + SENSITIVE_PATHS.some(sp => r.path.includes(sp)) + || MAYBE_SENSITIVE_PATHS.some(sp => r.path.includes(sp)), + ).length + + // If recent behavior is much more suspicious than older behavior + if (recentSuspicious > 0 && (olderSuspicious === 0 || recentSuspicious / olderRequests.length > olderSuspicious / olderRequests.length * 3)) { + result.suspicious = true + result.severity = 50 + result.reason = 'BEHAVIOR_CHANGE' + + // Mark this point as a behavior change point + if (!sessionData.behaviorChangePoints) { + sessionData.behaviorChangePoints = [] + } + sessionData.behaviorChangePoints.push(now) + } + } + } + + return result +} + +export interface DetectionFactor { + type: string + weight: number + evidence: any + timestamp: number + description: string +} + +export interface DebugInfo { + sessionAge: number + requestCount: number + pathHistory: string[] + timingAnalysis: { + avgInterval: number + consistency: number + entropy: number + } + factors: DetectionFactor[] + ipInfo: { + sessionCount: number + totalScore: number + isBlocked: boolean + isTrusted: boolean + } + confidence: number + reasoning: string[] + enhancedAnalysis?: any + modularAnalysis?: any +} + +export interface BotDetectionBehavior { + id: string + session: SessionData + ip: IPData + dirty?: boolean + debug?: DebugInfo +} + +export function analyzeSessionAndIpBehavior({ + request, + behavior, + timestamp = Date.now(), + debug = false, +}: { + request: { path: string; method: string } + behavior: BotDetectionBehavior + timestamp?: number + debug?: boolean +}) { + // Configuration should be set externally via setBotScoreThresholds() + + const path = request.path || '' + const method = request.method || 'GET' + // Check if this 
is a maybe-sensitive path + const isMaybeSensitive = isMaybeSensitivePath(path) + const now = timestamp + + // Initialize or get session data with improved defaults + const sessionData: SessionData = behavior.session + + // Initialize or get IP data with improved defaults + const ipData: IPData = behavior.ip + + // Initialize debug tracking + const detectionFactors: DetectionFactor[] = [] + const reasoning: string[] = [] + + function addFactor(type: string, weight: number, evidence: any, description: string) { + const factor: DetectionFactor = { + type, + weight, + evidence, + timestamp: now, + description, + } + detectionFactors.push(factor) + if (debug) { + reasoning.push(`${type}: ${description} (weight: ${weight})`) + } + return weight + } + + // Calculate scoring factors + const scoreFactors: Record = {} + + // Check for maybe-sensitive path access + if (isMaybeSensitive) { + sessionData.maybeSensitivePathHits = (sessionData.maybeSensitivePathHits || 0) + 1 + + // Track unique sensitive paths for detecting scanning behavior + sessionData.uniqueSensitivePathsAccessed = sessionData.uniqueSensitivePathsAccessed || [] + if (!sessionData.uniqueSensitivePathsAccessed.includes(path)) { + sessionData.uniqueSensitivePathsAccessed.push(path) + } + + // Apply score - smaller penalty for first hit, larger for repeated behavior + if (sessionData.maybeSensitivePathHits === 1) { + scoreFactors.MAYBE_SENSITIVE_PATH = addFactor( + 'MAYBE_SENSITIVE_PATH', + BEHAVIOR_WEIGHTS.MAYBE_SENSITIVE_PATH, + { path, hitCount: 1 }, + `First access to potentially sensitive path: ${path}`, + ) + } + else if (sessionData.maybeSensitivePathHits > 1) { + // Multiple hits to sensitive paths is more suspicious + const weight = BEHAVIOR_WEIGHTS.MAYBE_SENSITIVE_PATH * Math.min(3, sessionData.maybeSensitivePathHits) + scoreFactors.MAYBE_SENSITIVE_PATH = addFactor( + 'MAYBE_SENSITIVE_PATH', + weight, + { path, hitCount: sessionData.maybeSensitivePathHits }, + `Multiple access to sensitive paths 
(${sessionData.maybeSensitivePathHits} hits)`, + ) + + // If they hit multiple different sensitive paths, that's even more suspicious + if (sessionData.uniqueSensitivePathsAccessed.length >= 2) { + scoreFactors.MULTIPLE_SENSITIVE_HITS = addFactor( + 'MULTIPLE_SENSITIVE_HITS', + BEHAVIOR_WEIGHTS.MULTIPLE_SENSITIVE_HITS, + { uniquePaths: sessionData.uniqueSensitivePathsAccessed }, + `Scanning behavior: ${sessionData.uniqueSensitivePathsAccessed.length} different sensitive paths`, + ) + } + } + } + + // Calculate time since previous request if applicable + let timeSincePrevious = 0 + if (sessionData.lastRequests.length > 0) { + timeSincePrevious = now - sessionData.lastRequests[sessionData.lastRequests.length - 1].timestamp + } + + // Track this request with enhanced metadata + sessionData.lastRequests.push({ + timestamp: now, + path, + method, + timeSincePrevious, + }) + + // Track HTTP method variety + if (!sessionData.requestMethodVariety.includes(method)) { + sessionData.requestMethodVariety.push(method) + } + + // Only keep last 30 requests for better pattern analysis + if (sessionData.lastRequests.length > 30) { + sessionData.lastRequests.shift() + } + + // Apply time decay to previous scores (reduce by ~10% per hour) + const hoursSinceLastUpdate = (now - sessionData.lastUpdated) / (1000 * 60 * 60) + if (hoursSinceLastUpdate > 0) { + sessionData.score = Math.max(0, sessionData.score * 0.9 ** hoursSinceLastUpdate) + } + + // Associate this session with the IP if not already tracked + if (!ipData.activeSessions.includes(behavior.id)) { + ipData.activeSessions.push(behavior.id) + ipData.sessionCount = ipData.activeSessions.length + + // Calculate session creation rate + if (ipData.lastSessionCreated) { + const hoursSinceLastSession = (now - ipData.lastSessionCreated) / (1000 * 60 * 60) + + if (hoursSinceLastSession < 1) { + // If creating sessions more than once per hour, track the rate + ipData.sessionsPerHour = ipData.sessionsPerHour + ? 
(ipData.sessionsPerHour * 0.7 + (1 / hoursSinceLastSession) * 0.3) // Weighted average + : (1 / hoursSinceLastSession) + } + } + ipData.lastSessionCreated = now + } + + // 1. Check for sensitive path access + if (SENSITIVE_PATHS.some(sensitivePath => path.includes(sensitivePath))) { + sessionData.suspiciousPathHits++ + scoreFactors.SENSITIVE_PATH = addFactor( + 'SENSITIVE_PATH', + BEHAVIOR_WEIGHTS.SENSITIVE_PATH, + { path, hitCount: sessionData.suspiciousPathHits }, + `Access to highly sensitive path: ${path}`, + ) + } + + // 2. Check for rapid requests with adaptive rate limiting + const oneMinuteAgo = now - 60000 + const requestsLastMinute = sessionData.lastRequests.filter(req => req.timestamp > oneMinuteAgo).length + const adaptiveRateLimit = calculateRateLimit(sessionData) + + if (requestsLastMinute > adaptiveRateLimit) { + // Apply score proportional to how much the limit was exceeded + const overageRatio = requestsLastMinute / adaptiveRateLimit + const weight = Math.min( + BEHAVIOR_WEIGHTS.RAPID_REQUESTS * overageRatio, + BEHAVIOR_WEIGHTS.RAPID_REQUESTS * 2, // Cap at double the weight + ) + scoreFactors.RAPID_REQUESTS = addFactor( + 'RAPID_REQUESTS', + weight, + { requestsLastMinute, rateLimit: adaptiveRateLimit, overageRatio }, + `Too many requests: ${requestsLastMinute}/${adaptiveRateLimit} (${Math.round(overageRatio * 100)}% over limit)`, + ) + } + + // 3. Analyze request sequence for natural browsing patterns + if (sessionData.lastRequests.length >= 5) { + const sequenceAnalysis = analyzeRequestSequence(sessionData.lastRequests) + sessionData.requestSequenceEntropy = sequenceAnalysis.entropy + + if (!sequenceAnalysis.isNaturalBrowsing) { + scoreFactors.UNUSUAL_PATTERN = BEHAVIOR_WEIGHTS.UNUSUAL_PATTERN + * (1 - Math.min(1, sequenceAnalysis.entropy / 2)) + } + else { + // Reduce score for natural browsing patterns - positive reinforcement + sessionData.score = Math.max(0, sessionData.score - 5) + sessionData.knownGoodActions += 1 + } + } + + // 4. 
Check for session anomaly - add anomaly detection logic here + const sessionAnomaly = detectSessionAnomaly(ipData, sessionData, timestamp) + if (sessionAnomaly.suspicious) { + scoreFactors.SESSION_ANOMALY = Math.min( + BEHAVIOR_WEIGHTS.SESSION_ANOMALY, + sessionAnomaly.severity, + ) + } + + // 5. Check request timing consistency + // Bots often have very consistent intervals between requests + if (sessionData.lastRequests.length > 5) { + // Only analyze the existing requests, not including the one just added + // This prevents the new request from breaking the pattern analysis + const existingRequests = sessionData.lastRequests.slice(0, -1) + + // Extract intervals only from requests that have a valid timeSincePrevious value + const intervals: number[] = [] + for (let i = 0; i < existingRequests.length; i++) { + const timeSincePrevious = existingRequests[i]?.timeSincePrevious + if (timeSincePrevious && timeSincePrevious > 0) { + intervals.push(timeSincePrevious) + } + } + + // Only proceed if we have enough intervals to analyze + if (intervals.length >= 4) { + // Calculate mean and standard deviation + const mean = intervals.reduce((sum, val) => sum + val, 0) / intervals.length + const variance = intervals.reduce((sum, val) => sum + (val - mean) ** 2, 0) / intervals.length + const stdDev = Math.sqrt(variance) + + // Very low standard deviation indicates suspiciously consistent timing + const coefficientOfVariation = stdDev / mean + + // Enhanced scoring logic with multiple tiers of suspicion: + + // Extremely precise timing (practically impossible for humans) + if (coefficientOfVariation < 0.05 && intervals.length >= 5) { + // This is absolutely a bot - humans cannot maintain this precision + scoreFactors.RESOURCE_TIMING = BEHAVIOR_WEIGHTS.RESOURCE_TIMING * 3 // Triple the weight + + // If very fast as well (sub-second), even more suspicious + if (mean < 1000) { + scoreFactors.RESOURCE_TIMING += 15 // Additional penalty for inhuman speed + } + } + // Very 
suspicious timing + else if (coefficientOfVariation < 0.1 && mean < 2000) { + // Highly suspicious but not impossible + scoreFactors.RESOURCE_TIMING = BEHAVIOR_WEIGHTS.RESOURCE_TIMING * 2 // Double the weight + } + // Somewhat suspicious timing + else if (coefficientOfVariation < 0.2 && mean < 3000) { + // Still suspicious but less so + scoreFactors.RESOURCE_TIMING = BEHAVIOR_WEIGHTS.RESOURCE_TIMING + } + + // Update average time between requests for future analysis + sessionData.averageTimeBetweenRequests = mean + } + } + + // Add up all score factors + const additionalScore = Object.values(scoreFactors).reduce((sum, val) => sum + val, 0) + sessionData.score += additionalScore + + // Update traffic type classification based on score + if (sessionData.score >= BOT_SCORE_THRESHOLDS.DEFINITELY_BOT) { + sessionData.trafficType = TrafficType.MALICIOUS_BOT + } + else if (sessionData.score >= BOT_SCORE_THRESHOLDS.LIKELY_BOT) { + sessionData.trafficType = TrafficType.SUSPICIOUS + } + else if (sessionData.score >= BOT_SCORE_THRESHOLDS.SUSPICIOUS) { + sessionData.trafficType = TrafficType.SUSPICIOUS + } + else { + sessionData.trafficType = TrafficType.REGULAR_USER + } + + // Cap score at 100 + sessionData.score = Math.min(100, sessionData.score) + + // Update IP score based on session score with memory effect + // This allows the IP to be marked as suspicious based on behavior across multiple sessions + ipData.suspiciousScore = Math.max( + ipData.suspiciousScore * 0.9, // Decay previous score + sessionData.score * 0.8, // Influence from current session + ) + + // Increment legitimate session count if this seems to be a real user + if (sessionData.score <= BOT_SCORE_THRESHOLDS.PROBABLY_HUMAN + && sessionData.knownGoodActions >= 3) { + ipData.legitSessionsCount++ + + // If an IP has many legitimate sessions, gradually reduce its suspicious score + if (ipData.legitSessionsCount > 5 && ipData.suspiciousScore > 0) { + ipData.suspiciousScore = Math.max(0, ipData.suspiciousScore - 
5) + } + } + + // Save data back to storage + sessionData.lastUpdated = now + ipData.lastUpdated = now + + behavior.ip.isBot = sessionData.score >= BOT_SCORE_THRESHOLDS.LIKELY_BOT + behavior.ip.isBotConfidence = (sessionData.score + ipData.suspiciousScore) / 2 + + behavior.session = sessionData + behavior.ip = ipData + + // Add debug information if requested + if (debug) { + const sessionAge = now - sessionData.firstSeenAt + const avgInterval = sessionData.lastRequests.length > 1 + ? sessionData.lastRequests.reduce((sum, req, i) => { + if (i === 0) + return 0 + return sum + (req.timestamp - sessionData.lastRequests[i - 1].timestamp) + }, 0) / (sessionData.lastRequests.length - 1) + : 0 + + behavior.debug = { + sessionAge, + requestCount: sessionData.lastRequests.length, + pathHistory: sessionData.lastRequests.map(r => r.path), + timingAnalysis: { + avgInterval, + consistency: sessionData.requestSequenceEntropy, + entropy: sessionData.requestSequenceEntropy, + }, + factors: detectionFactors, + ipInfo: { + sessionCount: ipData.sessionCount, + totalScore: ipData.suspiciousScore, + isBlocked: false, // TODO: get from IP checking + isTrusted: false, // TODO: get from IP checking + }, + confidence: behavior.ip.isBotConfidence || 0, + reasoning, + } + } +} + +// Enhanced bot detection with improved behavior analysis + +// Update bot score after request completion (to account for status codes) +export function applyBehaviorForErrorPages( + status: number, + behavior: BotDetectionBehavior, +) { + const sessionData = behavior.session! + + // Update the last request with the status code + if (sessionData.lastRequests.length > 0) { + sessionData.lastRequests[sessionData.lastRequests.length - 1].status = status + } + + // Count errors (404s, 403s, etc.) 
+ if (status >= 400) { + sessionData.errorCount++ + + // Add score for repeated errors with progressive penalty + if (sessionData.errorCount > 2) { + // Apply increasing penalty for each error after the first few + const errorPenalty = Math.min( + BEHAVIOR_WEIGHTS.REPEATED_ERRORS, + BEHAVIOR_WEIGHTS.REPEATED_ERRORS * (sessionData.errorCount - 2) / 5, + ) + sessionData.score += errorPenalty + + // Check for consecutive errors + const recentRequests = sessionData.lastRequests.slice(-5) + const consecutiveErrors = recentRequests.filter(req => req.status && req.status >= 400).length + + if (consecutiveErrors >= 3) { + // Strong bot signal: consecutive errors + sessionData.score += BEHAVIOR_WEIGHTS.REPEATED_ERRORS + + // If a session shows significant behavior changes, make note of this + if (!sessionData.behaviorChangePoints) { + sessionData.behaviorChangePoints = [] + } + + // If we have a consecutive error pattern, consider this a behavior change point + sessionData.behaviorChangePoints.push(Date.now()) + } + } + + // 404s might indicate scanning for vulnerabilities + if (status === 404) { + // Apply smaller penalty for resource 404s + sessionData.score += BEHAVIOR_WEIGHTS.NONEXISTENT_RESOURCES * 0.3 + } + + // Cap score at 100 + sessionData.score = Math.min(100, sessionData.score) + + // Update traffic type if needed + if (sessionData.score >= BOT_SCORE_THRESHOLDS.DEFINITELY_BOT) { + sessionData.trafficType = TrafficType.MALICIOUS_BOT + } + else if (sessionData.score >= BOT_SCORE_THRESHOLDS.LIKELY_BOT) { + sessionData.trafficType = TrafficType.SUSPICIOUS + } + } + else if (status >= 200 && status < 300) { + // Successful requests may indicate legitimate use + // Especially 2xx on HTML pages + // Slightly reduce score for successful HTML page views + sessionData.score = Math.max(0, sessionData.score - 1) + sessionData.knownGoodActions += 0.5 + } + + // Update IP storage if the score changed significantly + if (Math.abs(sessionData.score - (sessionData.lastScore || 
0)) > 10) { + if (behavior.ip) { + // If this session suddenly became very suspicious, update IP score immediately + if (sessionData.score >= BOT_SCORE_THRESHOLDS.LIKELY_BOT + && (sessionData.lastScore || 0) < BOT_SCORE_THRESHOLDS.SUSPICIOUS) { + behavior.ip.suspiciousScore = Math.max(behavior.ip.suspiciousScore, sessionData.score * 0.8) + } + } + + // Remember the last score for future comparisons + sessionData.lastScore = sessionData.score + } +} diff --git a/libs/is-bot/src/behaviors/README.md b/libs/is-bot/src/behaviors/README.md new file mode 100644 index 00000000..bac10023 --- /dev/null +++ b/libs/is-bot/src/behaviors/README.md @@ -0,0 +1,174 @@ +# Bot Detection Behaviors + +This directory contains modular bot detection behaviors that can be enabled/disabled independently. Each behavior is categorized by complexity and reliability. + +## 🟢 Simple Behaviors (Recommended for Production) + +### Path Analysis (`path-analysis.ts`) +- **What it does**: Checks for access to sensitive paths like `/wp-admin`, `/.env`, `/admin` +- **Reliability**: Very High +- **Complexity**: Low +- **False Positives**: Very Low +- **Recommendation**: ✅ Always enable + +### Basic Timing (`timing-analysis.ts` - `analyzeBasicTiming`) +- **What it does**: Detects robotic timing patterns (too consistent intervals) +- **Reliability**: High +- **Complexity**: Low +- **False Positives**: Low +- **Recommendation**: ✅ Enable for most sites + +### Basic Rate Limiting (`rate-limiting.ts` - `analyzeBasicRateLimit`) +- **What it does**: Simple request rate checking with fixed thresholds +- **Reliability**: High +- **Complexity**: Low +- **False Positives**: Low +- **Recommendation**: ✅ Enable with appropriate thresholds + +### Basic User Agent (`user-agent-analysis.ts` - `analyzeBasicUserAgent`) +- **What it does**: Checks for missing/suspicious user agents and bot signatures +- **Reliability**: High +- **Complexity**: Low +- **False Positives**: Very Low +- **Recommendation**: ✅ Always enable + 
+### Simple Patterns (`intent-analysis.ts` - `analyzeSimplePatterns`) +- **What it does**: Detects obvious scanning patterns and sequential ID enumeration +- **Reliability**: High +- **Complexity**: Low +- **False Positives**: Low +- **Recommendation**: ✅ Enable for most sites + +### Basic Positive Signals (`positive-signals.ts` - `analyzeBasicPositiveSignals`) +- **What it does**: Rewards search engine referrers, reasonable timing, auth sessions +- **Reliability**: High +- **Complexity**: Low +- **False Positives**: Very Low +- **Recommendation**: ✅ Always enable + +## 🟡 Intermediate Behaviors (Use with Caution) + +### Burst Detection (`rate-limiting.ts` - `analyzeBurstPattern`) +- **What it does**: Detects sudden spikes in request activity +- **Reliability**: Medium +- **Complexity**: Medium +- **False Positives**: Medium (can trigger during legitimate browsing spikes) +- **Recommendation**: ⚠️ Test thoroughly before production + +### Header Consistency (`user-agent-analysis.ts` - `analyzeHeaderConsistency`) +- **What it does**: Checks for missing/inconsistent browser headers +- **Reliability**: Medium +- **Complexity**: Medium +- **False Positives**: Medium (some legitimate tools have minimal headers) +- **Recommendation**: ⚠️ Consider for high-security environments + +### Contextual Rate Limiting (`rate-limiting.ts` - `analyzeContextualRateLimit`) +- **What it does**: Adaptive rate limits based on user context and intent +- **Reliability**: Medium +- **Complexity**: High +- **False Positives**: Medium +- **Recommendation**: ⚠️ Requires careful tuning + +## 🔴 Advanced Behaviors (Experimental - High Risk) + +### Advanced Timing (`timing-analysis.ts` - `analyzeAdvancedTiming`) +- **What it does**: Complex timing pattern analysis including periodic and mathematical progressions +- **Reliability**: Low-Medium +- **Complexity**: Very High +- **False Positives**: High (complex timing can have false patterns) +- **Recommendation**: ❌ Not recommended for production + 
+### Advanced Intent (`intent-analysis.ts` - `analyzeAdvancedIntent`) +- **What it does**: Complex behavioral analysis for navigation patterns and diversity +- **Reliability**: Low-Medium +- **Complexity**: Very High +- **False Positives**: High +- **Recommendation**: ❌ Experimental only + +### Browser Fingerprinting (`user-agent-analysis.ts` - `analyzeBrowserFingerprint`) +- **What it does**: Complex browser entropy and header order analysis +- **Reliability**: Low +- **Complexity**: Very High +- **False Positives**: Very High +- **Recommendation**: ❌ Not suitable for production + +### Advanced Positive Signals (`positive-signals.ts` - `analyzeAdvancedPositiveSignals`) +- **What it does**: Complex credibility building and behavioral learning +- **Reliability**: Medium +- **Complexity**: High +- **False Positives**: Medium +- **Recommendation**: ⚠️ Requires significant testing + +### Behavioral Credibility (`positive-signals.ts` - `analyzeBehavioralCredibility`) +- **What it does**: ML-like behavioral scoring with multiple factors +- **Reliability**: Low-Medium +- **Complexity**: Very High +- **False Positives**: High +- **Recommendation**: ❌ Research/experimental only + +## Configuration Example + +```typescript +import { setBehaviorConfig } from './modular-analyzer' + +// Conservative production config +setBehaviorConfig({ + simple: { + pathAnalysis: { enabled: true, weight: 1.0 }, + basicTiming: { enabled: true, weight: 0.8 }, + basicRateLimit: { enabled: true, weight: 1.0 }, + basicUserAgent: { enabled: true, weight: 1.0 }, + simplePatterns: { enabled: true, weight: 1.0 }, + basicPositiveSignals: { enabled: true, weight: 1.0 } + }, + intermediate: { + burstDetection: { enabled: false, weight: 0.8 }, + headerConsistency: { enabled: false, weight: 0.7 }, + contextualRateLimit: { enabled: false, weight: 0.9 } + }, + advanced: { + // All disabled for production + advancedTiming: { enabled: false, weight: 0.6 }, + advancedIntent: { enabled: false, weight: 0.5 }, + 
browserFingerprint: { enabled: false, weight: 0.4 }, + advancedPositiveSignals: { enabled: false, weight: 0.6 }, + behavioralCredibility: { enabled: false, weight: 0.3 } + } +}) +``` + +## Recommendations by Site Type + +### **E-commerce / High Traffic** +- Enable: All simple behaviors +- Consider: Basic burst detection (with higher thresholds) +- Avoid: All advanced behaviors + +### **Content Sites / Blogs** +- Enable: All simple behaviors except aggressive rate limiting +- Consider: Header consistency for comment spam +- Avoid: Complex timing analysis + +### **APIs / Developer Tools** +- Enable: Path analysis, user agent, simple patterns +- Consider: Contextual rate limiting +- Avoid: Timing analysis (legitimate tools vary) + +### **High Security / Admin Panels** +- Enable: All simple + intermediate behaviors +- Consider: Advanced positive signals for known users +- Monitor: All behaviors in non-blocking mode first + +## Testing Strategy + +1. **Start Simple**: Enable only green behaviors initially +2. **Monitor**: Use debug mode to see behavior outputs +3. **Gradual Addition**: Add one intermediate behavior at a time +4. **A/B Test**: Compare detection rates and false positives +5. 
**Never in Production**: Don't enable red behaviors in production + +## Performance Notes + +- Simple behaviors: Minimal performance impact +- Intermediate behaviors: Slight performance impact +- Advanced behaviors: Significant performance impact and maintenance overhead diff --git a/libs/is-bot/src/behaviors/index.ts b/libs/is-bot/src/behaviors/index.ts new file mode 100644 index 00000000..9544f6c6 --- /dev/null +++ b/libs/is-bot/src/behaviors/index.ts @@ -0,0 +1,73 @@ +// Modular bot detection behaviors +// Each behavior can be enabled/disabled and configured independently + +export * from './intent-analysis' +export * from './path-analysis' +export * from './positive-signals' +export * from './rate-limiting' +export * from './timing-analysis' +export * from './user-agent-analysis' + +// Behavior categories by complexity and reliability +export const SIMPLE_BEHAVIORS = { + // High reliability, low complexity - recommended for production + pathAnalysis: 'analyzePathAccess', + basicTiming: 'analyzeBasicTiming', + basicRateLimit: 'analyzeBasicRateLimit', + basicUserAgent: 'analyzeBasicUserAgent', + simplePatterns: 'analyzeSimplePatterns', + basicPositiveSignals: 'analyzeBasicPositiveSignals', +} as const + +export const INTERMEDIATE_BEHAVIORS = { + // Medium complexity, good reliability - use with caution + burstDetection: 'analyzeBurstPattern', + headerConsistency: 'analyzeHeaderConsistency', + contextualRateLimit: 'analyzeContextualRateLimit', +} as const + +export const ADVANCED_BEHAVIORS = { + // High complexity, higher error rate - experimental + advancedTiming: 'analyzeAdvancedTiming', + advancedIntent: 'analyzeAdvancedIntent', + browserFingerprint: 'analyzeBrowserFingerprint', + advancedPositiveSignals: 'analyzeAdvancedPositiveSignals', + behavioralCredibility: 'analyzeBehavioralCredibility', +} as const + +// Configuration interface +export interface BehaviorConfig { + enabled: boolean + weight: number // Multiplier for the behavior's score + threshold?: 
number // Custom threshold for this behavior +} + +export interface BotDetectionBehaviorConfig { + simple: Record + intermediate: Record + advanced: Record +} + +// Default configuration - only simple behaviors enabled +export const DEFAULT_BEHAVIOR_CONFIG: BotDetectionBehaviorConfig = { + simple: { + pathAnalysis: { enabled: true, weight: 1.0 }, + basicTiming: { enabled: true, weight: 1.0 }, + basicRateLimit: { enabled: true, weight: 1.0 }, + basicUserAgent: { enabled: true, weight: 1.0 }, + simplePatterns: { enabled: true, weight: 1.0 }, + basicPositiveSignals: { enabled: true, weight: 1.0 }, + }, + intermediate: { + burstDetection: { enabled: false, weight: 0.8 }, + headerConsistency: { enabled: false, weight: 0.7 }, + contextualRateLimit: { enabled: false, weight: 0.9 }, + }, + advanced: { + advancedTiming: { enabled: false, weight: 0.6 }, + advancedIntent: { enabled: false, weight: 0.5 }, + browserFingerprint: { enabled: false, weight: 0.4 }, + advancedPositiveSignals: { enabled: false, weight: 0.6 }, + behavioralCredibility: { enabled: false, weight: 0.3 }, + }, +} diff --git a/libs/is-bot/src/behaviors/intent-analysis.ts b/libs/is-bot/src/behaviors/intent-analysis.ts new file mode 100644 index 00000000..1a9c1f4d --- /dev/null +++ b/libs/is-bot/src/behaviors/intent-analysis.ts @@ -0,0 +1,183 @@ +// User intent analysis behavior +import type { SessionData } from '../behavior' + +/** + * Simple pattern detection for obvious bot behavior + * Low complexity, high confidence + */ +export function analyzeSimplePatterns(sessionData: SessionData): { score: number, reason: string } { + if (sessionData.lastRequests.length < 5) { + return { score: 0, reason: 'insufficient-data' } + } + + const paths = sessionData.lastRequests.map(r => r.path) + + // Check for obvious scanning patterns + const scanningIndicators = [ + /\/admin/, + /\/wp-admin/, + /\/login/, + /\.php$/, + /\.asp$/, + /config/, + /backup/, + ] + + let scanningHits = 0 + for (const path of paths) { + for 
(const pattern of scanningIndicators) { + if (pattern.test(path)) { + scanningHits++ + break + } + } + } + + // If more than 50% of requests hit scanning patterns + if (scanningHits / paths.length > 0.5) { + return { score: 35, reason: `scanning-pattern: ${scanningHits}/${paths.length} hits` } + } + + // Check for sequential numeric patterns (id scanning) + const numericPaths = paths.filter(p => /\/\d+/.test(p)) + if (numericPaths.length >= 3) { + const numbers = numericPaths.map((p) => { + const match = p.match(/\/(\d+)/) + return match ? Number.parseInt(match[1]) : 0 + }).sort((a, b) => a - b) + + // Check if sequential + let sequential = true + for (let i = 1; i < numbers.length; i++) { + if (numbers[i] !== numbers[i - 1] + 1) { + sequential = false + break + } + } + + if (sequential) { + return { score: 40, reason: 'sequential-id-scanning' } + } + } + + return { score: 0, reason: 'normal-patterns' } +} + +/** + * Advanced intent recognition with multiple behavioral indicators + * High complexity, higher chance of false positives + */ +export function analyzeAdvancedIntent(sessionData: SessionData): { score: number, reason: string } { + if (sessionData.lastRequests.length < 8) { + return { score: 0, reason: 'insufficient-data' } + } + + const paths = sessionData.lastRequests.map(r => r.path) + const recentPaths = paths.slice(-10) + + // Analyze navigation patterns + const navAnalysis = analyzeNavigationPatterns(recentPaths) + if (navAnalysis.suspicious) { + return { score: navAnalysis.score, reason: navAnalysis.reason } + } + + // Analyze path diversity + const diversityAnalysis = analyzePathDiversity(recentPaths) + if (diversityAnalysis.suspicious) { + return { score: diversityAnalysis.score, reason: diversityAnalysis.reason } + } + + // Analyze error patterns + const errorAnalysis = analyzeErrorPatterns(sessionData.lastRequests) + if (errorAnalysis.suspicious) { + return { score: errorAnalysis.score, reason: errorAnalysis.reason } + } + + return { score: 0, 
reason: 'normal-advanced-intent' } +} + +function analyzeNavigationPatterns(paths: string[]): { suspicious: boolean, score: number, reason: string } { + // Check for logical navigation flow + const hasLogicalFlow = checkLogicalFlow(paths) + if (!hasLogicalFlow) { + // Check if it's random or systematic + const pathSet = new Set(paths) + const uniqueRatio = pathSet.size / paths.length + + if (uniqueRatio > 0.9) { + return { suspicious: true, score: 25, reason: 'random-navigation-pattern' } + } + + // Check for alphabetical ordering + const sorted = [...paths].sort() + const isAlphabetical = paths.join('') === sorted.join('') + if (isAlphabetical) { + return { suspicious: true, score: 30, reason: 'alphabetical-scanning' } + } + } + + return { suspicious: false, score: 0, reason: 'normal-navigation' } +} + +function analyzePathDiversity(paths: string[]): { suspicious: boolean, score: number, reason: string } { + const sections = new Set(paths.map(p => `/${p.split('/')[1]}`).filter(Boolean)) + + // Too many different sections too quickly + if (sections.size > paths.length * 0.7 && paths.length > 5) { + return { suspicious: true, score: 20, reason: 'excessive-path-diversity' } + } + + // All requests to same deep path structure + const pathStructures = paths.map(p => p.split('/').slice(0, 3).join('/')) + const structureSet = new Set(pathStructures) + if (structureSet.size === 1 && paths.length > 6) { + return { suspicious: true, score: 15, reason: 'narrow-path-focus' } + } + + return { suspicious: false, score: 0, reason: 'normal-diversity' } +} + +function analyzeErrorPatterns(requests: Array<{ path: string, status?: number }>): { suspicious: boolean, score: number, reason: string } { + const recentRequests = requests.slice(-10) + const errorCount = recentRequests.filter(r => r.status && r.status >= 400).length + + // Too many errors suggests probing + if (errorCount > recentRequests.length * 0.6) { + return { suspicious: true, score: 25, reason: 
'excessive-error-generation' } + } + + // No errors at all can also be suspicious for exploration + if (errorCount === 0 && recentRequests.length > 8) { + const uniquePaths = new Set(recentRequests.map(r => r.path)) + if (uniquePaths.size === recentRequests.length) { + return { suspicious: true, score: 10, reason: 'error-free-exploration' } + } + } + + return { suspicious: false, score: 0, reason: 'normal-error-pattern' } +} + +function checkLogicalFlow(paths: string[]): boolean { + // Very simple check for logical navigation + const hasHome = paths.some(p => p === '/' || p === '') + const hasDeepPaths = paths.some(p => p.split('/').length > 3) + + // Basic logical flow: start at home or main sections, then go deeper + if (hasHome && hasDeepPaths) { + return true + } + + // Check for common navigation patterns + const commonPatterns = [ + /^\/$/, // Home + /^\/[^/]+$/, // Main section + /^\/[^/]+\/[^/]+$/, // Subsection + ] + + const patternMatches = commonPatterns.map(pattern => + paths.some(path => pattern.test(path)), + ) + + // If matches multiple levels, consider it logical + return patternMatches.filter(Boolean).length >= 2 +} diff --git a/libs/is-bot/src/behaviors/path-analysis.ts b/libs/is-bot/src/behaviors/path-analysis.ts new file mode 100644 index 00000000..851ec8a2 --- /dev/null +++ b/libs/is-bot/src/behaviors/path-analysis.ts @@ -0,0 +1,79 @@ +// Path-based bot detection behavior +import type { H3Event } from 'h3' +import type { ImprovedDetectionContext, SiteProfile } from '../improved-behavior' +import { getResponseStatus } from 'h3' + +/** + * Simple path-based detection - checks for sensitive paths + * Low complexity, high reliability + */ +export function analyzePathAccess( + path: string, + _context: ImprovedDetectionContext, +): { score: number, reason: string } { + // Very high confidence malicious patterns + const highRiskPatterns = [ + /wp-config\.php/, + /\.env$/, + /phpmyadmin/, + /admin\.php/, + /wp-login\.php/, + ] + + const 
mediumRiskPatterns = [ + /\/admin$/, + /\/login$/, + /\/dashboard$/, + /\/config$/, + ] + + // Check for high-risk patterns + for (const pattern of highRiskPatterns) { + if (pattern.test(path)) { + return { score: 40, reason: `high-risk-path: ${path}` } + } + } + + // Check for medium-risk patterns + for (const pattern of mediumRiskPatterns) { + if (pattern.test(path)) { + return { score: 15, reason: `medium-risk-path: ${path}` } + } + } + + return { score: 0, reason: 'normal-path' } +} + +/** + * Build basic site profile by tracking successful responses + * Simple and reliable + */ +export function buildBasicSiteProfile(event: H3Event, existingProfile?: SiteProfile): SiteProfile { + const profile = existingProfile || { + detectedCMS: 'unknown', + hasAdminArea: false, + adminPaths: [], + apiEndpoints: [], + existingPaths: new Set(), + userAgentPatterns: new Map(), + legitimateAccessPatterns: [], + } + + const path = event.path || '' + const status = getResponseStatus(event) + + // Only track successful responses + if (status >= 200 && status < 300) { + profile.existingPaths.add(path) + + // Simple CMS detection + if (path.includes('/wp-') || path.includes('wp-admin')) { + profile.detectedCMS = 'wordpress' + } + else if (path.includes('/_nuxt/')) { + profile.detectedCMS = 'nuxt' + } + } + + return profile +} diff --git a/libs/is-bot/src/behaviors/positive-signals.ts b/libs/is-bot/src/behaviors/positive-signals.ts new file mode 100644 index 00000000..d5c17b9e --- /dev/null +++ b/libs/is-bot/src/behaviors/positive-signals.ts @@ -0,0 +1,188 @@ +// Positive signals that indicate legitimate users +import type { SessionData } from '../behavior' +import type { ImprovedDetectionContext } from '../improved-behavior' + +/** + * Simple positive signals - clear indicators of human behavior + * Low complexity, high confidence + */ +export function analyzeBasicPositiveSignals( + headers: Record, + sessionData: SessionData, +): { score: number, reason: string } { + let 
positiveScore = 0 + const reasons = [] + + // Legitimate referrer + const referrer = headers.referer || headers.referrer || '' + if (referrer && ( + referrer.includes('google.com') + || referrer.includes('bing.com') + || referrer.includes('duckduckgo.com') + )) { + positiveScore += 10 + reasons.push('search-engine-referrer') + } + + // Time spent reading (reasonable intervals between requests) + if (sessionData.lastRequests.length >= 3) { + const intervals = [] + for (let i = 1; i < sessionData.lastRequests.length; i++) { + const interval = sessionData.lastRequests[i].timestamp - sessionData.lastRequests[i - 1].timestamp + intervals.push(interval) + } + + const avgInterval = intervals.reduce((sum, val) => sum + val, 0) / intervals.length + + // 5-120 seconds between requests suggests reading + if (avgInterval > 5000 && avgInterval < 120000) { + positiveScore += 15 + reasons.push('content-engagement') + } + } + + // Standard browser headers + if (headers['accept-language'] && headers['accept-encoding']) { + positiveScore += 5 + reasons.push('complete-headers') + } + + // Authentication cookies (if present) + const cookies = headers.cookie || '' + if (cookies.includes('session') || cookies.includes('auth') || cookies.includes('login')) { + positiveScore += 20 + reasons.push('authenticated-session') + } + + if (positiveScore > 0) { + return { score: -positiveScore, reason: `positive-signals: ${reasons.join(', ')}` } + } + + return { score: 0, reason: 'no-positive-signals' } +} + +/** + * Advanced positive signal analysis with behavioral learning + * Higher complexity, may be less reliable + */ +export function analyzeAdvancedPositiveSignals( + headers: Record, + sessionData: SessionData, + _context: ImprovedDetectionContext, +): { score: number, reason: string } { + let positiveScore = 0 + const reasons = [] + + // Credibility building over time + if (sessionData.knownGoodActions > 5) { + positiveScore += Math.min(20, sessionData.knownGoodActions * 2) + 
reasons.push(`credibility-score: ${sessionData.knownGoodActions}`) + } + + // Consistent user agent over session + if (sessionData.lastRequests.length > 5) { + // This would require tracking user agent per request - complex + positiveScore += 5 + reasons.push('consistent-identity') + } + + // Natural error patterns (humans make typos) + const recentRequests = sessionData.lastRequests.slice(-10) + const errorRate = recentRequests.filter(r => r.status === 404).length / recentRequests.length + if (errorRate > 0.05 && errorRate < 0.2) { // 5-20% error rate is human-like + positiveScore += 8 + reasons.push('natural-error-pattern') + } + + // Form interactions (if we tracked them) + // This would require additional tracking infrastructure + + // Mobile vs desktop patterns + const userAgent = headers['user-agent'] || '' + if (userAgent.includes('Mobile') || userAgent.includes('iPhone') || userAgent.includes('Android')) { + // Mobile users often have different patterns + positiveScore += 5 + reasons.push('mobile-device') + } + + // Geographic consistency (would need IP geolocation) + // Complex and requires external services + + if (positiveScore > 0) { + return { score: -positiveScore, reason: `advanced-positive: ${reasons.join(', ')}` } + } + + return { score: 0, reason: 'no-advanced-positive-signals' } +} + +/** + * Machine learning-like behavioral scoring + * Very complex, high maintenance overhead + */ +export function analyzeBehavioralCredibility(sessionData: SessionData): { score: number, reason: string } { + // This would ideally use a trained model, but for now we'll use heuristics + + let credibilityScore = 50 // Start neutral + const factors = [] + + // Session age factor + const sessionAge = Date.now() - sessionData.firstSeenAt + if (sessionAge > 5 * 60 * 1000) { // 5+ minutes + credibilityScore += 10 + factors.push('established-session') + } + + // Request variety + const uniquePaths = new Set(sessionData.lastRequests.map(r => r.path)) + const varietyRatio = 
uniquePaths.size / sessionData.lastRequests.length + if (varietyRatio > 0.3 && varietyRatio < 0.8) { // Sweet spot for humans + credibilityScore += 5 + factors.push('good-path-variety') + } + + // Timing variance (humans are inconsistent) + if (sessionData.lastRequests.length >= 5) { + const intervals = [] + for (let i = 1; i < sessionData.lastRequests.length; i++) { + const interval = sessionData.lastRequests[i].timestamp - sessionData.lastRequests[i - 1].timestamp + intervals.push(interval) + } + + const mean = intervals.reduce((sum, val) => sum + val, 0) / intervals.length + const variance = intervals.reduce((sum, val) => sum + (val - mean) ** 2, 0) / intervals.length + const coefficientOfVariation = Math.sqrt(variance) / mean + + if (coefficientOfVariation > 0.3) { // Good human-like variance + credibilityScore += 8 + factors.push('human-timing-variance') + } + } + + // Error recovery (humans click back, retry, etc.) + const errors = sessionData.lastRequests.filter(r => r.status && r.status >= 400) + if (errors.length > 0 && errors.length < sessionData.lastRequests.length) { + // Check if there were successful requests after errors + const hasRecovery = errors.some(errorReq => + sessionData.lastRequests.some(req => + req.timestamp > errorReq.timestamp && (!req.status || req.status < 400), + ), + ) + + if (hasRecovery) { + credibilityScore += 10 + factors.push('error-recovery-behavior') + } + } + + // Convert credibility score to bot detection score + const adjustment = (credibilityScore - 50) * 0.3 // Scale to reasonable range + + if (Math.abs(adjustment) > 2) { + return { + score: -adjustment, + reason: `behavioral-credibility: ${credibilityScore}/100 (${factors.join(', ')})`, + } + } + + return { score: 0, reason: 'neutral-credibility' } +} diff --git a/libs/is-bot/src/behaviors/rate-limiting.ts b/libs/is-bot/src/behaviors/rate-limiting.ts new file mode 100644 index 00000000..682d0bd8 --- /dev/null +++ b/libs/is-bot/src/behaviors/rate-limiting.ts @@ -0,0 
+1,102 @@ +// Rate limiting bot detection behavior +import type { SessionData } from '../behavior' +import type { ImprovedDetectionContext } from '../improved-behavior' + +/** + * Simple rate limiting check + * Low complexity, high reliability + */ +export function analyzeBasicRateLimit(sessionData: SessionData): { score: number, reason: string } { + const now = Date.now() + const oneMinuteAgo = now - 60000 + const requestsLastMinute = sessionData.lastRequests.filter(r => r.timestamp > oneMinuteAgo).length + + // Fixed thresholds - simple and predictable + if (requestsLastMinute > 30) { + return { score: 50, reason: `excessive-requests: ${requestsLastMinute}/min` } + } + + if (requestsLastMinute > 20) { + return { score: 25, reason: `high-requests: ${requestsLastMinute}/min` } + } + + if (requestsLastMinute > 15) { + return { score: 10, reason: `elevated-requests: ${requestsLastMinute}/min` } + } + + return { score: 0, reason: 'normal-rate' } +} + +/** + * Context-aware rate limiting with adaptive thresholds + * Higher complexity, may be error-prone in edge cases + */ +export function analyzeContextualRateLimit( + sessionData: SessionData, + context: ImprovedDetectionContext, +): { score: number, reason: string } { + const now = Date.now() + const oneMinuteAgo = now - 60000 + const requestsLastMinute = sessionData.lastRequests.filter(r => r.timestamp > oneMinuteAgo).length + + // Dynamic rate limits based on context + let rateLimit = 15 // Default + + // Adjust based on authentication status + if (context.authenticationStatus === 'authenticated') { + rateLimit = 30 // Authenticated users get higher limits + } + + // Adjust based on user intent + if (context.userIntent === 'scanning') { + rateLimit = 5 // Very strict for scanners + } + else if (context.userIntent === 'browsing') { + rateLimit = 25 // More lenient for browsers + } + + // Adjust based on referrer + if (context.referrerContext === 'search-engine') { + rateLimit += 5 // Slight bonus for search engine 
referrals + } + + if (requestsLastMinute > rateLimit) { + const overage = requestsLastMinute - rateLimit + const score = Math.min(50, overage * 3) + return { + score, + reason: `contextual-rate-exceeded: ${requestsLastMinute}/${rateLimit} (intent: ${context.userIntent})`, + } + } + + return { score: 0, reason: 'within-rate-limit' } +} + +/** + * Burst detection - looks for sudden spikes in activity + * Medium complexity, good for catching automated tools + */ +export function analyzeBurstPattern(sessionData: SessionData): { score: number, reason: string } { + if (sessionData.lastRequests.length < 10) { + return { score: 0, reason: 'insufficient-data' } + } + + const now = Date.now() + const intervals = [10000, 30000, 60000] // 10s, 30s, 1min windows + + for (const interval of intervals) { + const windowStart = now - interval + const requestsInWindow = sessionData.lastRequests.filter(r => r.timestamp > windowStart).length + const expectedMax = Math.ceil(interval / 2000) // Rough estimate: 1 request per 2 seconds max + + if (requestsInWindow > expectedMax * 2) { + const windowSeconds = interval / 1000 + return { + score: 30, + reason: `burst-detected: ${requestsInWindow} requests in ${windowSeconds}s`, + } + } + } + + return { score: 0, reason: 'normal-burst-pattern' } +} diff --git a/libs/is-bot/src/behaviors/timing-analysis.ts b/libs/is-bot/src/behaviors/timing-analysis.ts new file mode 100644 index 00000000..42e5ef22 --- /dev/null +++ b/libs/is-bot/src/behaviors/timing-analysis.ts @@ -0,0 +1,104 @@ +// Timing-based bot detection behavior +import type { SessionData } from '../behavior' + +/** + * Basic timing consistency check + * Simple and reliable - checks for robotic timing patterns + */ +export function analyzeBasicTiming(sessionData: SessionData): { score: number, reason: string } { + if (sessionData.lastRequests.length < 5) { + return { score: 0, reason: 'insufficient-data' } + } + + const intervals = [] + for (let i = 1; i < sessionData.lastRequests.length; 
i++) { + const interval = sessionData.lastRequests[i].timestamp - sessionData.lastRequests[i - 1].timestamp + intervals.push(interval) + } + + const mean = intervals.reduce((sum, val) => sum + val, 0) / intervals.length + const variance = intervals.reduce((sum, val) => sum + (val - mean) ** 2, 0) / intervals.length + const stdDev = Math.sqrt(variance) + const coefficientOfVariation = stdDev / mean + + // Very consistent timing is suspicious + if (coefficientOfVariation < 0.05 && mean < 2000) { + return { score: 35, reason: 'robotic-timing-detected' } + } + + // Somewhat consistent timing + if (coefficientOfVariation < 0.15 && mean < 1000) { + return { score: 20, reason: 'suspicious-timing-pattern' } + } + + return { score: 0, reason: 'human-like-timing' } +} + +/** + * Advanced timing analysis with more complex patterns + * Higher complexity, may have false positives + */ +export function analyzeAdvancedTiming(sessionData: SessionData): { score: number, reason: string } { + if (sessionData.lastRequests.length < 10) { + return { score: 0, reason: 'insufficient-data' } + } + + const intervals = [] + for (let i = 1; i < sessionData.lastRequests.length; i++) { + const interval = sessionData.lastRequests[i].timestamp - sessionData.lastRequests[i - 1].timestamp + intervals.push(interval) + } + + // Check for periodic patterns (e.g., every 1000ms, 2000ms, etc.) 
+ const periodicPattern = checkPeriodicPattern(intervals) + if (periodicPattern.detected) { + return { score: 40, reason: `periodic-pattern: ${periodicPattern.period}ms` } + } + + // Check for mathematical progressions + const progression = checkMathematicalProgression(intervals) + if (progression.detected) { + return { score: 30, reason: `mathematical-progression: ${progression.type}` } + } + + return analyzeBasicTiming(sessionData) +} + +function checkPeriodicPattern(intervals: number[]): { detected: boolean, period?: number } { + const tolerance = 50 // 50ms tolerance + + // Check common periods + const commonPeriods = [500, 1000, 1500, 2000, 3000, 5000] + + for (const period of commonPeriods) { + const matches = intervals.filter(interval => + Math.abs(interval - period) <= tolerance, + ) + + if (matches.length >= Math.ceil(intervals.length * 0.6)) { + return { detected: true, period } + } + } + + return { detected: false } +} + +function checkMathematicalProgression(intervals: number[]): { detected: boolean, type?: string } { + if (intervals.length < 5) + return { detected: false } + + // Check arithmetic progression + const diffs = [] + for (let i = 1; i < intervals.length; i++) { + diffs.push(intervals[i] - intervals[i - 1]) + } + + const avgDiff = diffs.reduce((sum, val) => sum + val, 0) / diffs.length + const diffVariance = diffs.reduce((sum, val) => sum + (val - avgDiff) ** 2, 0) / diffs.length + + if (Math.sqrt(diffVariance) < 10 && Math.abs(avgDiff) > 5) { + return { detected: true, type: 'arithmetic' } + } + + return { detected: false } +} diff --git a/libs/is-bot/src/behaviors/user-agent-analysis.ts b/libs/is-bot/src/behaviors/user-agent-analysis.ts new file mode 100644 index 00000000..b5697553 --- /dev/null +++ b/libs/is-bot/src/behaviors/user-agent-analysis.ts @@ -0,0 +1,154 @@ +// User agent and header analysis +import type { SessionData } from '../behavior' + +/** + * Basic user agent validation + * Simple and reliable + */ +export function 
analyzeBasicUserAgent(headers: Record): { score: number, reason: string } { + const userAgent = headers['user-agent'] || '' + + // Missing user agent + if (!userAgent) { + return { score: 30, reason: 'missing-user-agent' } + } + + // Too short to be real + if (userAgent.length < 20) { + return { score: 25, reason: 'suspicious-user-agent-length' } + } + + // Common bot signatures + const botSignatures = [ + /bot/i, + /crawler/i, + /spider/i, + /scraper/i, + /curl/i, + /wget/i, + /python-requests/i, + ] + + for (const pattern of botSignatures) { + if (pattern.test(userAgent)) { + return { score: 40, reason: 'bot-signature-detected' } + } + } + + return { score: 0, reason: 'normal-user-agent' } +} + +/** + * Advanced header consistency analysis + * Higher complexity, may have false positives + */ +export function analyzeHeaderConsistency(headers: Record): { score: number, reason: string } { + const userAgent = headers['user-agent'] || '' + const acceptLanguage = headers['accept-language'] || '' + const acceptEncoding = headers['accept-encoding'] || '' + const accept = headers.accept || '' + + let suspiciousCount = 0 + const issues = [] + + // Check for basic browser headers + if (!accept) { + suspiciousCount++ + issues.push('missing-accept-header') + } + + if (!acceptLanguage) { + suspiciousCount++ + issues.push('missing-accept-language') + } + + if (!acceptEncoding) { + suspiciousCount++ + issues.push('missing-accept-encoding') + } + + // Check for inconsistencies + if (userAgent.includes('Chrome') && !acceptEncoding.includes('gzip')) { + suspiciousCount++ + issues.push('chrome-without-gzip') + } + + if (userAgent.includes('Mozilla') && !userAgent.includes('Gecko') && !userAgent.includes('WebKit')) { + suspiciousCount++ + issues.push('invalid-mozilla-signature') + } + + // Score based on number of issues + if (suspiciousCount >= 3) { + return { score: 35, reason: `header-inconsistency: ${issues.join(', ')}` } + } + + if (suspiciousCount >= 2) { + return { score: 20, 
reason: `header-issues: ${issues.join(', ')}` } + } + + if (suspiciousCount >= 1) { + return { score: 10, reason: `minor-header-issue: ${issues.join(', ')}` } + } + + return { score: 0, reason: 'consistent-headers' } +} + +/** + * Browser fingerprinting analysis + * Very complex, high chance of false positives + */ +export function analyzeBrowserFingerprint( + headers: Record, + _sessionData: SessionData, +): { score: number, reason: string } { + const userAgent = headers['user-agent'] || '' + const acceptLanguage = headers['accept-language'] || '' + + // Calculate "fingerprint entropy" + let entropy = 0 + const features = [] + + // User agent entropy + if (userAgent) { + entropy += Math.log2(userAgent.length) + features.push('user-agent') + } + + // Language entropy + if (acceptLanguage) { + const languages = acceptLanguage.split(',').length + entropy += Math.log2(languages + 1) + features.push('languages') + } + + // Client hints + if (headers['sec-ch-ua']) { + entropy += 2 + features.push('client-hints') + } + + // DNT header + if (headers.dnt) { + entropy += 1 + features.push('dnt') + } + + // Very low entropy suggests a bot + if (entropy < 3 && features.length < 2) { + return { score: 25, reason: `low-browser-entropy: ${entropy.toFixed(1)}` } + } + + // Check for header order consistency (browsers typically send headers in specific orders) + const headerOrder = Object.keys(headers) + if (headerOrder.length > 5) { + // This is complex and error-prone - simplified version + const hasStandardOrder = headerOrder.includes('user-agent') + && headerOrder.includes('accept') + if (!hasStandardOrder) { + return { score: 15, reason: 'unusual-header-order' } + } + } + + return { score: 0, reason: 'normal-browser-fingerprint' } +} diff --git a/libs/is-bot/src/core.ts b/libs/is-bot/src/core.ts new file mode 100644 index 00000000..2e41e3c1 --- /dev/null +++ b/libs/is-bot/src/core.ts @@ -0,0 +1,359 @@ +// Core bot detection engine - H3/Nuxt focused +import type { H3Event } 
from 'h3' +import type { + BotDetectionConfig, + BotDetectionRequest, + BotDetectionResponse, + DetectionContext, + ResponseStatusProvider, + SessionIdentifier, + SiteProfile +} from './types' +import type { IPData, SessionData } from './behavior' +import type { BehaviorStorage } from './adapters/behavior-storage' +import { modularBotAnalysis, DEFAULT_BEHAVIOR_CONFIG, type BotDetectionBehaviorConfig } from './modular-analyzer' +import { type BotDetectionBehavior, TrafficType } from './behavior' + +export class BotDetectionEngine { + private storage: BehaviorStorage + private sessionIdentifier: SessionIdentifier + private responseStatusProvider?: ResponseStatusProvider + private config: BotDetectionConfig + private behaviorConfig: BotDetectionBehaviorConfig + private siteProfile: SiteProfile | null = null + + constructor(options: { + storage: BehaviorStorage + sessionIdentifier: SessionIdentifier + responseStatusProvider?: ResponseStatusProvider + config?: BotDetectionConfig + }) { + this.storage = options.storage + this.sessionIdentifier = options.sessionIdentifier + this.responseStatusProvider = options.responseStatusProvider + this.config = { + session: { + ttl: 24 * 60 * 60 * 1000, // 24 hours + maxSessionsPerIP: 10, + ...options.config?.session + }, + thresholds: { + definitelyBot: 90, + likelyBot: 70, + suspicious: 40, + ...options.config?.thresholds + }, + customSensitivePaths: options.config?.customSensitivePaths || [], + ipFilter: { + trustedIPs: ['127.0.0.1', '::1'], + blockedIPs: [], + ...options.config?.ipFilter + }, + debug: options.config?.debug || false, + behaviors: options.config?.behaviors + } + + // Merge behavior configuration + this.behaviorConfig = { + simple: { ...DEFAULT_BEHAVIOR_CONFIG.simple, ...this.config.behaviors?.simple }, + intermediate: { ...DEFAULT_BEHAVIOR_CONFIG.intermediate, ...this.config.behaviors?.intermediate }, + advanced: { ...DEFAULT_BEHAVIOR_CONFIG.advanced, ...this.config.behaviors?.advanced } + } + } + + async 
analyze(request: BotDetectionRequest, event?: H3Event): Promise { + const timestamp = request.timestamp || Date.now() + + // Get session ID + const sessionId = await this.sessionIdentifier.getSessionId(request) + + // Check IP blocklist/allowlist + if (this.isIPBlocked(request.ip)) { + return this.createBlockedResponse(sessionId, 'ip-blocked') + } + + if (this.isIPTrusted(request.ip)) { + return this.createTrustedResponse(sessionId, 'ip-trusted') + } + + // Get or create session and IP data + const [sessionData, ipData] = await Promise.all([ + this.getOrCreateSession(sessionId, timestamp), + this.getOrCreateIPData(request.ip, sessionId, timestamp) + ]) + + // Get or create site profile + this.siteProfile = await this.getOrCreateSiteProfile(request) + + // Create behavior object for analysis + const behavior: BotDetectionBehavior = { + id: sessionId, + session: sessionData, + ip: ipData, + dirty: false + } + + // Run modular analysis - requires H3Event + let analysis + if (event) { + analysis = modularBotAnalysis({ + event, + behavior, + config: this.behaviorConfig, + debug: this.config.debug + }) + } else { + // Fallback analysis without H3Event + analysis = { + botScore: this.basicBotScore(request, sessionData, ipData), + confidence: 50, + factors: [], + recommendation: 'allow' as const + } + } + + // Update session data + this.updateSessionData(sessionData, request, timestamp) + + // Update IP data + this.updateIPData(ipData, sessionData, timestamp) + + // Apply response status if available + if (this.responseStatusProvider) { + const status = this.responseStatusProvider.getStatus(request) + if (status) { + this.applyResponseStatus(sessionData, status) + } + } + + // Save updated data + await Promise.all([ + this.storage.setSession(sessionId, sessionData), + this.storage.setIP(request.ip, ipData), + this.siteProfile ? 
this.storage.setSiteProfile(this.siteProfile) : Promise.resolve() + ]) + + return { + isBot: analysis.botScore >= (this.config.thresholds?.likelyBot || 70), + confidence: analysis.confidence, + score: analysis.botScore, + factors: analysis.factors, + recommendation: analysis.recommendation, + sessionId + } + } + + private isIPBlocked(ip: string): boolean { + return this.config.ipFilter?.blockedIPs?.includes(ip) || false + } + + private isIPTrusted(ip: string): boolean { + return this.config.ipFilter?.trustedIPs?.includes(ip) || false + } + + private async getOrCreateSession(sessionId: string, timestamp: number): Promise { + const existing = await this.storage.getSession(sessionId) + if (existing) { + return existing + } + + return { + lastRequests: [], + suspiciousPathHits: 0, + maybeSensitivePathHits: 0, + uniqueSensitivePathsAccessed: [], + errorCount: 0, + score: 0, + lastScore: 0, + lastUpdated: timestamp, + knownGoodActions: 0, + requestMethodVariety: [], + requestSequenceEntropy: 0, + firstSeenAt: timestamp, + behaviorChangePoints: [], + trafficType: TrafficType.UNKNOWN + } + } + + private async getOrCreateIPData(ip: string, sessionId: string, timestamp: number): Promise { + const existing = await this.storage.getIP(ip) + if (existing) { + // Add session if not already tracked + if (!existing.activeSessions.includes(sessionId)) { + existing.activeSessions.push(sessionId) + existing.sessionCount = existing.activeSessions.length + } + return existing + } + + return { + sessionCount: 1, + activeSessions: [sessionId], + suspiciousScore: 0, + lastUpdated: timestamp, + legitSessionsCount: 0, + factores: [], + isBot: false, + isBotConfidence: 0, + lastSessionCreated: timestamp + } + } + + private async getOrCreateSiteProfile(request: BotDetectionRequest): Promise { + const existing = await this.storage.getSiteProfile() + if (existing) { + return existing + } + + return { + detectedCMS: 'unknown', + hasAdminArea: false, + adminPaths: [], + apiEndpoints: [], + 
existingPaths: new Set(), + userAgentPatterns: new Map(), + legitimateAccessPatterns: [] + } + } + + private updateSessionData(sessionData: SessionData, request: BotDetectionRequest, timestamp: number) { + // Add current request + sessionData.lastRequests.push({ + timestamp, + path: request.path, + method: request.method, + timeSincePrevious: sessionData.lastRequests.length > 0 + ? timestamp - sessionData.lastRequests[sessionData.lastRequests.length - 1].timestamp + : 0 + }) + + // Keep only last 30 requests + if (sessionData.lastRequests.length > 30) { + sessionData.lastRequests.shift() + } + + // Update method variety + if (!sessionData.requestMethodVariety.includes(request.method)) { + sessionData.requestMethodVariety.push(request.method) + } + + sessionData.lastUpdated = timestamp + } + + private updateIPData(ipData: IPData, sessionData: SessionData, timestamp: number) { + ipData.lastUpdated = timestamp + + // Update IP score based on session score + ipData.suspiciousScore = Math.max( + ipData.suspiciousScore * 0.9, // Decay + sessionData.score * 0.8 // Current session influence + ) + + // Update bot confidence + ipData.isBotConfidence = (sessionData.score + ipData.suspiciousScore) / 2 + ipData.isBot = sessionData.score >= (this.config.thresholds?.likelyBot || 70) + } + + private applyResponseStatus(sessionData: SessionData, status: number) { + // Update the last request with status + if (sessionData.lastRequests.length > 0) { + sessionData.lastRequests[sessionData.lastRequests.length - 1].status = status + } + + // Count errors + if (status >= 400) { + sessionData.errorCount++ + + // Apply error penalty + if (sessionData.errorCount > 2) { + sessionData.score += Math.min(15, sessionData.errorCount * 2) + } + } else if (status >= 200 && status < 300) { + // Successful requests indicate legitimate use + sessionData.score = Math.max(0, sessionData.score - 1) + sessionData.knownGoodActions += 0.5 + } + + // Cap score + sessionData.score = Math.min(100, 
sessionData.score) + } + + private basicBotScore(request: BotDetectionRequest, sessionData: SessionData, ipData: IPData): number { + let score = 0 + + // Basic user agent check + const userAgent = Array.isArray(request.headers['user-agent']) + ? request.headers['user-agent'][0] || '' + : request.headers['user-agent'] || '' + if (!userAgent || userAgent.length < 20) { + score += 30 + } + + // Check for common bot patterns + const botPatterns = /bot|crawler|spider|scraper|curl|wget|python-requests/i + if (botPatterns.test(userAgent)) { + score += 50 + } + + // Rate limiting - simple check + if (sessionData.lastRequests.length > 10) { + const avgInterval = sessionData.lastRequests.reduce((sum, req, i) => { + return i > 0 ? sum + (req.timestamp - sessionData.lastRequests[i-1].timestamp) : sum + }, 0) / Math.max(1, sessionData.lastRequests.length - 1) + + if (avgInterval < 1000) { // Less than 1 second between requests + score += 25 + } + } + + // Sensitive path access + const sensitivePaths = ['/admin', '/wp-admin', '/.env', '/wp-login'] + if (sensitivePaths.some(path => request.path.includes(path))) { + score += 20 + } + + return Math.min(100, score) + } + + private createBlockedResponse(sessionId: string, reason: string): BotDetectionResponse { + return { + isBot: true, + confidence: 100, + score: 100, + factors: [{ type: 'IP_FILTER', score: 100, reason }], + recommendation: 'block', + sessionId + } + } + + private createTrustedResponse(sessionId: string, reason: string): BotDetectionResponse { + return { + isBot: false, + confidence: 100, + score: 0, + factors: [{ type: 'IP_FILTER', score: -100, reason }], + recommendation: 'allow', + sessionId + } + } + + // Public configuration methods + updateConfig(config: Partial) { + this.config = { ...this.config, ...config } + } + + updateBehaviorConfig(config: Partial) { + this.behaviorConfig = { + simple: { ...this.behaviorConfig.simple, ...config.simple }, + intermediate: { ...this.behaviorConfig.intermediate, 
...config.intermediate }, + advanced: { ...this.behaviorConfig.advanced, ...config.advanced } + } + } + + // Cleanup method + async cleanup() { + if (this.storage.cleanup) { + await this.storage.cleanup() + } + } +} \ No newline at end of file diff --git a/libs/is-bot/src/drivers/h3.ts b/libs/is-bot/src/drivers/h3.ts new file mode 100644 index 00000000..9517502f --- /dev/null +++ b/libs/is-bot/src/drivers/h3.ts @@ -0,0 +1,146 @@ +// H3 driver for bot detection +import type { H3Event } from 'h3' +import { getHeaders, getRequestIP, getResponseStatus, useSession } from 'h3' +import type { + BotDetectionDriver, + BotDetectionRequestData, + BotDetectionDriverOptions +} from './types' + +export class H3Driver implements BotDetectionDriver { + private options: BotDetectionDriverOptions + + constructor(options: BotDetectionDriverOptions = {}) { + this.options = { + sessionConfig: { + password: 'default-bot-detection-password', + cookieName: 'nuxt-session', + ...options.sessionConfig + }, + ipExtraction: { + trustProxy: true, + proxyHeaders: ['x-forwarded-for', 'x-real-ip'], + ...options.ipExtraction + }, + debug: options.debug || false + } + } + + extractRequest(event: H3Event): BotDetectionRequestData { + const headers = getHeaders(event) + const ip = getRequestIP(event, { + xForwardedFor: this.options.ipExtraction?.trustProxy + }) || '127.0.0.1' + + const userAgent = this.getHeaderValue(headers, 'user-agent') + const referer = this.getHeaderValue(headers, 'referer') || this.getHeaderValue(headers, 'referrer') + const acceptLanguage = this.getHeaderValue(headers, 'accept-language') + const acceptEncoding = this.getHeaderValue(headers, 'accept-encoding') + + return { + path: event.path || '/', + method: event.method || 'GET', + headers, + ip, + timestamp: Date.now(), + userAgent, + referer, + acceptLanguage, + acceptEncoding + } + } + + async extractSessionId(event: H3Event): Promise { + try { + const session = await useSession(event, { + password: 
this.options.sessionConfig?.password || 'default-bot-detection-password' + }) + return session.id + } catch (error) { + if (this.options.debug) { + console.warn('Failed to get session, falling back to IP-based session:', error) + } + // Fallback to IP + User Agent hash + return this.generateFallbackSessionId(event) + } + } + + extractResponseStatus(event: H3Event): number | undefined { + try { + return getResponseStatus(event) + } catch { + return undefined + } + } + + isTrustedIP(ip: string): boolean { + // Common trusted IP ranges + const trustedRanges = [ + '127.0.0.1', + '::1', + '10.0.0.0/8', + '172.16.0.0/12', + '192.168.0.0/16' + ] + + // Simple check for exact matches (in production, use proper CIDR matching) + return trustedRanges.some(range => { + if (range.includes('/')) { + // Simplified CIDR check - in production use proper library + const baseIP = range.split('/')[0] + return ip.startsWith(baseIP.substring(0, baseIP.lastIndexOf('.'))) + } + return ip === range + }) + } + + getAdditionalContext(event: H3Event): Record { + const headers = getHeaders(event) + + return { + protocol: headers['x-forwarded-proto'] || 'http', + host: headers.host, + connection: headers.connection, + upgradeInsecureRequests: headers['upgrade-insecure-requests'], + secFetchSite: headers['sec-fetch-site'], + secFetchMode: headers['sec-fetch-mode'], + secFetchUser: headers['sec-fetch-user'], + secFetchDest: headers['sec-fetch-dest'], + secChUa: headers['sec-ch-ua'], + secChUaMobile: headers['sec-ch-ua-mobile'], + secChUaPlatform: headers['sec-ch-ua-platform'] + } + } + + private getHeaderValue(headers: Record, name: string): string | undefined { + const value = headers[name] + if (Array.isArray(value)) { + return value[0] + } + return value + } + + private generateFallbackSessionId(event: H3Event): string { + const headers = getHeaders(event) + const ip = getRequestIP(event, { xForwardedFor: this.options.ipExtraction?.trustProxy }) || '127.0.0.1' + const userAgent = 
this.getHeaderValue(headers, 'user-agent') || '' + + // Create a deterministic session ID from IP and User Agent + return `fallback-${this.simpleHash(`${ip}-${userAgent}`)}` + } + + private simpleHash(str: string): string { + let hash = 0 + for (let i = 0; i < str.length; i++) { + const char = str.charCodeAt(i) + hash = ((hash << 5) - hash) + char + hash = hash & hash // Convert to 32-bit integer + } + return Math.abs(hash).toString(36) + } +} + +// Convenience function for quick H3 integration +export function createH3BotDetection(options: BotDetectionDriverOptions = {}) { + return new H3Driver(options) +} \ No newline at end of file diff --git a/libs/is-bot/src/drivers/types.ts b/libs/is-bot/src/drivers/types.ts new file mode 100644 index 00000000..b1ca7895 --- /dev/null +++ b/libs/is-bot/src/drivers/types.ts @@ -0,0 +1,51 @@ +// Driver interface types for bot detection +export interface BotDetectionDriver { + /** + * Extract bot detection request data from the framework's request object + */ + extractRequest(request: TRequest): BotDetectionRequestData + + /** + * Extract session ID from the framework's request + */ + extractSessionId(request: TRequest): Promise | string + + /** + * Extract response status from the framework's response (if available) + */ + extractResponseStatus?(request: TRequest, response?: TResponse): number | undefined + + /** + * Check if IP is from a trusted source (e.g., load balancer) + */ + isTrustedIP?(ip: string): boolean + + /** + * Get additional context from the framework + */ + getAdditionalContext?(request: TRequest): Record +} + +export interface BotDetectionRequestData { + path: string + method: string + headers: Record + ip: string + timestamp: number + userAgent?: string + referer?: string + acceptLanguage?: string + acceptEncoding?: string +} + +export interface BotDetectionDriverOptions { + sessionConfig?: { + password?: string + cookieName?: string + } + ipExtraction?: { + trustProxy?: boolean + proxyHeaders?: string[] + } 
+ debug?: boolean +} \ No newline at end of file diff --git a/libs/is-bot/src/enhanced-analyzer.ts b/libs/is-bot/src/enhanced-analyzer.ts new file mode 100644 index 00000000..dcb8c0a6 --- /dev/null +++ b/libs/is-bot/src/enhanced-analyzer.ts @@ -0,0 +1,543 @@ +// Enhanced bot detection analyzer with strict, context-aware heuristics +import type { H3Event } from 'h3' +import type { BotDetectionBehavior, SessionData } from './behavior' +import type { ImprovedDetectionContext, SiteProfile } from './improved-behavior' +import { getHeaders } from 'h3' +import { + analyzeUserIntent, + buildSiteProfile, + IMPROVED_BEHAVIOR_WEIGHTS, + + scorePathAccess, + + updateCredibilityScore, +} from './improved-behavior' + +// Global site profile (in production, this should be persisted) +const globalSiteProfile: { value: SiteProfile | null } = { value: null } + +// Enhanced bot score thresholds (more strict) +export const ENHANCED_THRESHOLDS = { + DEFINITELY_BOT: 80, // Lowered from 90 - we're more confident now + LIKELY_BOT: 60, // Lowered from 70 - better precision + SUSPICIOUS: 35, // Lowered from 40 - catch more edge cases + PROBABLY_HUMAN: 15, // Lowered from 20 - positive scoring allows this + DEFINITELY_HUMAN: -10, // NEW: Negative scores for highly trusted users +} + +// Enhanced analysis with context awareness +export function enhancedBotAnalysis({ + event, + behavior, + timestamp: _timestamp = Date.now(), + debug: _debug = false, +}: { + event: H3Event + behavior: BotDetectionBehavior + timestamp?: number + debug?: boolean +}): { + botScore: number + confidence: number + factors: Array<{ type: string, score: number, reason: string }> + context: ImprovedDetectionContext + recommendation: 'allow' | 'challenge' | 'block' + } { + const path = event.path || '' + const _method = event.method || 'GET' + const headers = getHeaders(event) + const sessionData = behavior.session + const _ipData = behavior.ip + + // Build/update site profile + globalSiteProfile.value = 
buildSiteProfile(event, globalSiteProfile.value || undefined) + + // Analyze user intent based on request history + const userIntent = analyzeUserIntent(sessionData.lastRequests) + + // Build detection context + const context: ImprovedDetectionContext = { + userIntent: userIntent as 'browsing' | 'exploring' | 'scanning' | 'exploiting' | 'unknown', + accessPattern: analyzeAccessPattern(sessionData) as 'human-like' | 'systematic' | 'random' | 'malicious', + credibilityScore: sessionData.knownGoodActions * 5, // Convert to 0-100 scale + authenticationStatus: detectAuthStatus(headers, path) as 'authenticated' | 'anonymous' | 'unknown', + referrerContext: analyzeReferrer(headers) as 'internal' | 'search-engine' | 'direct' | 'suspicious', + technicalProfile: analyzeTechnicalProfile(headers, sessionData), + } + + // Enhanced scoring with context awareness + const factors: Array<{ type: string, score: number, reason: string }> = [] + let totalScore = 0 + + // 1. Context-aware path analysis + const pathAnalysis = scorePathAccess(path, globalSiteProfile.value!, context) + if (pathAnalysis.score !== 0) { + factors.push({ + type: 'PATH_ACCESS', + score: pathAnalysis.score, + reason: pathAnalysis.reason, + }) + totalScore += pathAnalysis.score + } + + // 2. Intent-based scoring + const intentScore = scoreUserIntent(context.userIntent, sessionData) + if (intentScore !== 0) { + factors.push({ + type: 'USER_INTENT', + score: intentScore, + reason: `Intent detected as: ${context.userIntent}`, + }) + totalScore += intentScore + } + + // 3. Positive scoring for good behavior + const positiveScore = scorePositiveBehavior(context, sessionData, globalSiteProfile.value!) + if (positiveScore !== 0) { + factors.push({ + type: 'POSITIVE_BEHAVIOR', + score: positiveScore, + reason: 'Legitimate user behavior patterns detected', + }) + totalScore += positiveScore + } + + // 4. 
Technical profile analysis + const techScore = scoreTechnicalProfile(context.technicalProfile) + if (techScore !== 0) { + factors.push({ + type: 'TECHNICAL_PROFILE', + score: techScore, + reason: 'Technical fingerprint analysis', + }) + totalScore += techScore + } + + // 5. Rate limiting with context + const rateScore = scoreRateLimiting(sessionData, context, globalSiteProfile.value!) + if (rateScore !== 0) { + factors.push({ + type: 'RATE_LIMITING', + score: rateScore, + reason: 'Request rate analysis', + }) + totalScore += rateScore + } + + // 6. Enhanced timing analysis + const timingScore = scoreTimingPatterns(sessionData, context) + if (timingScore !== 0) { + factors.push({ + type: 'TIMING_ANALYSIS', + score: timingScore, + reason: 'Request timing pattern analysis', + }) + totalScore += timingScore + } + + // Update credibility score + const newCredibilityScore = updateCredibilityScore( + context.credibilityScore, + sessionData, + context, + ) + + // Apply credibility bonus/penalty + const credibilityAdjustment = (newCredibilityScore - 50) * 0.2 // -10 to +10 adjustment + totalScore += credibilityAdjustment + + if (credibilityAdjustment !== 0) { + factors.push({ + type: 'CREDIBILITY_ADJUSTMENT', + score: credibilityAdjustment, + reason: `User credibility: ${newCredibilityScore}/100`, + }) + } + + // Calculate confidence based on number of data points + const confidence = calculateConfidence(sessionData, factors.length) + + // Determine recommendation + const recommendation = determineRecommendation(totalScore, confidence, context) + + return { + botScore: Math.round(totalScore * 10) / 10, // Round to 1 decimal + confidence, + factors, + context, + recommendation, + } +} + +function analyzeAccessPattern(sessionData: SessionData): string { + const requests = sessionData.lastRequests + if (requests.length < 3) + return 'unknown' + + // Check for human-like patterns + const hasVariedTiming = checkTimingVariation(requests) + const hasLogicalFlow = 
checkLogicalFlow(requests) + const hasNaturalErrors = checkNaturalErrorPattern(requests) + + if (hasVariedTiming && hasLogicalFlow && hasNaturalErrors) { + return 'human-like' + } + + // Check for systematic patterns + const isSystematic = checkSystematicAccess(requests) + if (isSystematic) + return 'systematic' + + // Check for random patterns (possible bot) + const isRandom = checkRandomPattern(requests) + if (isRandom) + return 'random' + + return 'unknown' +} + +function checkTimingVariation(requests: Array<{ timestamp: number }>): boolean { + if (requests.length < 3) + return true + + const intervals = [] + for (let i = 1; i < requests.length; i++) { + intervals.push(requests[i].timestamp - requests[i - 1].timestamp) + } + + const mean = intervals.reduce((sum, val) => sum + val, 0) / intervals.length + const variance = intervals.reduce((sum, val) => sum + (val - mean) ** 2, 0) / intervals.length + const stdDev = Math.sqrt(variance) + const coefficientOfVariation = stdDev / mean + + // Human timing should have some variation (> 0.3) + return coefficientOfVariation > 0.3 +} + +function checkLogicalFlow(requests: Array<{ path: string }>): boolean { + // Look for logical navigation patterns rather than random access + const paths = requests.map(r => r.path) + + // Check for common logical flows + const logicalPatterns = [ + /^\/$/, // Start at home + /^\/[^/]+$/, // Go to main section + /^\/[^/]+\/[^/]+/, // Go deeper + ] + + // At least 50% should follow some logical pattern + const logicalRequests = paths.filter(path => + logicalPatterns.some(pattern => pattern.test(path)), + ) + + return logicalRequests.length / paths.length > 0.5 +} + +function checkNaturalErrorPattern(requests: Array<{ status?: number }>): boolean { + const errorCount = requests.filter(r => r.status && r.status >= 400).length + const totalRequests = requests.length + + // Natural users have some errors (typos, broken links) but not too many + const errorRate = errorCount / totalRequests + 
return errorRate > 0.05 && errorRate < 0.3 // 5-30% error rate is natural +} + +function checkSystematicAccess(requests: Array<{ path: string }>): boolean { + const paths = requests.map(r => r.path) + + // Check for sequential patterns + const numbers = paths.map((p) => { + const match = p.match(/\/(\d+)/) + return match ? Number.parseInt(match[1]) : null + }).filter(n => n !== null) + + if (numbers.length >= 3) { + const sorted = [...numbers].sort((a, b) => a - b) + const isSequential = sorted.every((val, i) => i === 0 || val === sorted[i - 1] + 1) + if (isSequential) + return true + } + + return false +} + +function checkRandomPattern(requests: Array<{ path: string }>): boolean { + const paths = requests.map(r => r.path) + const uniquePaths = new Set(paths) + + // Very high unique path ratio with no logical flow suggests random access + return (uniquePaths.size / paths.length) > 0.9 && paths.length > 5 +} + +function detectAuthStatus(headers: Record, path: string): string { + // Check for authentication headers/cookies + const authHeaders = ['authorization', 'cookie', 'x-auth-token'] + const hasAuthHeaders = authHeaders.some(header => headers[header]) + + if (hasAuthHeaders) + return 'authenticated' + if (path.includes('/login') || path.includes('/auth')) + return 'unknown' + return 'anonymous' +} + +function analyzeReferrer(headers: Record): string { + const referrer = headers.referer || headers.referrer || '' + + if (!referrer) + return 'direct' + + if (referrer.includes('google.com') + || referrer.includes('bing.com') + || referrer.includes('duckduckgo.com')) { + return 'search-engine' + } + + // Check if internal referrer + try { + const referrerUrl = new URL(referrer) + const currentHost = headers.host + if (referrerUrl.hostname === currentHost) { + return 'internal' + } + } + catch {} + + return 'external' +} + +function analyzeTechnicalProfile(headers: Record, sessionData: SessionData) { + const _userAgent = headers['user-agent'] || '' + const 
acceptLanguage = headers['accept-language'] || '' + const acceptEncoding = headers['accept-encoding'] || '' + + const browserFeatures = [] + if (acceptLanguage) + browserFeatures.push('language') + if (acceptEncoding.includes('gzip')) + browserFeatures.push('compression') + if (headers['sec-ch-ua']) + browserFeatures.push('client-hints') + + const networkConsistency = calculateNetworkConsistency(sessionData) + const headerCredibility = calculateHeaderCredibility(headers) + + return { + browserFeatures, + networkConsistency, + headerCredibility, + } +} + +function calculateNetworkConsistency(sessionData: SessionData): number { + // Analyze if timing patterns suggest same network/client + const requests = sessionData.lastRequests + if (requests.length < 5) + return 0.5 // Neutral + + const intervals = [] + for (let i = 1; i < requests.length; i++) { + intervals.push(requests[i].timestamp - requests[i - 1].timestamp) + } + + const mean = intervals.reduce((sum, val) => sum + val, 0) / intervals.length + const variance = intervals.reduce((sum, val) => sum + (val - mean) ** 2, 0) / intervals.length + + // Consistent network should have some baseline timing + return Math.min(1, variance / 1000) // Normalize to 0-1 +} + +function calculateHeaderCredibility(headers: Record): number { + let credibility = 0.5 // Start neutral + + const userAgent = headers['user-agent'] || '' + const acceptLanguage = headers['accept-language'] || '' + + // Positive indicators + if (userAgent.includes('Mozilla/') && userAgent.includes('Chrome/')) + credibility += 0.2 + if (acceptLanguage.includes(',')) + credibility += 0.1 // Multiple languages + if (headers['sec-ch-ua']) + credibility += 0.1 // Modern browser + if (headers['accept-encoding']?.includes('br')) + credibility += 0.1 // Brotli support + + // Negative indicators + if (!userAgent) + credibility -= 0.3 + if (userAgent.length < 20) + credibility -= 0.2 // Too short + if (!headers.accept) + credibility -= 0.2 + + return Math.max(0, 
Math.min(1, credibility)) +} + +function scoreUserIntent(intent: string, _sessionData: SessionData): number { + switch (intent) { + case 'browsing': + return IMPROVED_BEHAVIOR_WEIGHTS.GOOD_NAVIGATION + case 'exploring': + return 0 // Neutral - legitimate exploration + case 'scanning': + return IMPROVED_BEHAVIOR_WEIGHTS.SYSTEMATIC_ENUMERATION + case 'exploiting': + return IMPROVED_BEHAVIOR_WEIGHTS.VULNERABILITY_PROBE + default: + return 0 + } +} + +function scorePositiveBehavior( + context: ImprovedDetectionContext, + sessionData: SessionData, + _siteProfile: SiteProfile, +): number { + let positiveScore = 0 + + // Reward legitimate referrers + if (context.referrerContext === 'search-engine') { + positiveScore += IMPROVED_BEHAVIOR_WEIGHTS.LEGITIMATE_REFERRER + } + + // Reward proper headers + if (context.technicalProfile.headerCredibility > 0.7) { + positiveScore += IMPROVED_BEHAVIOR_WEIGHTS.PROPER_HEADERS + } + + // Reward authenticated access + if (context.authenticationStatus === 'authenticated') { + positiveScore += IMPROVED_BEHAVIOR_WEIGHTS.AUTHENTICATED_ACCESS + } + + // Reward content engagement (time between requests suggests reading) + const avgInterval = sessionData.averageTimeBetweenRequests || 0 + if (avgInterval > 10000 && avgInterval < 120000) { // 10s - 2min suggests reading + positiveScore += IMPROVED_BEHAVIOR_WEIGHTS.CONTENT_ENGAGEMENT + } + + return positiveScore +} + +function scoreTechnicalProfile(profile: any): number { + let score = 0 + + if (profile.headerCredibility < 0.3) { + score += IMPROVED_BEHAVIOR_WEIGHTS.SUSPICIOUS_USER_AGENT + } + + if (profile.browserFeatures.length < 2) { + score += 10 // Penalty for minimal browser features + } + + return score +} + +function scoreRateLimiting( + sessionData: SessionData, + context: ImprovedDetectionContext, + siteProfile: SiteProfile, +): number { + const now = Date.now() + const oneMinuteAgo = now - 60000 + const requestsLastMinute = sessionData.lastRequests.filter(r => r.timestamp > 
oneMinuteAgo).length + + // Context-aware rate limits + let rateLimit = 15 // Default + + if (context.authenticationStatus === 'authenticated') + rateLimit = 30 + if (siteProfile.apiEndpoints.length > 0 && context.userIntent === 'browsing') + rateLimit = 25 + if (context.userIntent === 'scanning') + rateLimit = 5 // Very strict for scanners + + if (requestsLastMinute > rateLimit) { + const overage = requestsLastMinute - rateLimit + return Math.min(IMPROVED_BEHAVIOR_WEIGHTS.API_ABUSE, overage * 5) + } + + return 0 +} + +function scoreTimingPatterns(sessionData: SessionData, context: ImprovedDetectionContext): number { + if (sessionData.lastRequests.length < 5) + return 0 + + const intervals = [] + for (let i = 1; i < sessionData.lastRequests.length; i++) { + const interval = sessionData.lastRequests[i].timestamp - sessionData.lastRequests[i - 1].timestamp + intervals.push(interval) + } + + const mean = intervals.reduce((sum, val) => sum + val, 0) / intervals.length + const variance = intervals.reduce((sum, val) => sum + (val - mean) ** 2, 0) / intervals.length + const stdDev = Math.sqrt(variance) + const coefficientOfVariation = stdDev / mean + + // Only penalize if clearly robotic AND intent is suspicious + if (coefficientOfVariation < 0.05 && mean < 2000 && context.userIntent === 'scanning') { + return 30 // High penalty for robotic timing + suspicious intent + } + + // Very strict for exploitation + if (coefficientOfVariation < 0.1 && context.userIntent === 'exploiting') { + return 40 + } + + return 0 +} + +function calculateConfidence(sessionData: SessionData, factorCount: number): number { + // Confidence based on amount of data and number of detection factors + const requestCount = sessionData.lastRequests.length + const timeSpan = Date.now() - sessionData.firstSeenAt + + let confidence = 0 + + // More requests = higher confidence + confidence += Math.min(50, requestCount * 5) + + // Longer observation = higher confidence + confidence += Math.min(30, timeSpan 
/ (60000)) // Minutes observed + + // More detection factors = higher confidence + confidence += Math.min(20, factorCount * 4) + + return Math.min(100, confidence) +} + +function determineRecommendation( + score: number, + confidence: number, + _context: ImprovedDetectionContext, +): 'allow' | 'challenge' | 'block' { + // High confidence decisions + if (confidence > 80) { + if (score >= ENHANCED_THRESHOLDS.DEFINITELY_BOT) + return 'block' + if (score >= ENHANCED_THRESHOLDS.LIKELY_BOT) + return 'challenge' + if (score <= ENHANCED_THRESHOLDS.DEFINITELY_HUMAN) + return 'allow' + } + + // Medium confidence - be more conservative + if (confidence > 50) { + if (score >= ENHANCED_THRESHOLDS.DEFINITELY_BOT + 10) + return 'block' + if (score >= ENHANCED_THRESHOLDS.LIKELY_BOT + 5) + return 'challenge' + } + + // Low confidence - mostly allow with some challenges + if (score >= ENHANCED_THRESHOLDS.DEFINITELY_BOT + 20) + return 'block' + if (score >= ENHANCED_THRESHOLDS.LIKELY_BOT + 15) + return 'challenge' + + return 'allow' +} + +export { globalSiteProfile } diff --git a/libs/is-bot/src/improved-behavior.ts b/libs/is-bot/src/improved-behavior.ts new file mode 100644 index 00000000..f79ce3d7 --- /dev/null +++ b/libs/is-bot/src/improved-behavior.ts @@ -0,0 +1,303 @@ +// Improved bot detection with context-aware heuristics +import type { H3Event } from 'h3' +import { getHeaders, getResponseStatus } from 'h3' + +// Smart path analysis that adapts to the actual site +export interface SiteProfile { + detectedCMS?: 'wordpress' | 'drupal' | 'nuxt' | 'next' | 'unknown' + hasAdminArea: boolean + adminPaths: string[] + apiEndpoints: string[] + existingPaths: Set + userAgentPatterns: Map + legitimateAccessPatterns: string[] +} + +// Context-aware scoring that considers intent +export interface ImprovedDetectionContext { + userIntent: 'browsing' | 'exploring' | 'scanning' | 'exploiting' | 'unknown' + accessPattern: 'human-like' | 'systematic' | 'random' | 'malicious' + credibilityScore: 
number // 0-100, builds over time + authenticationStatus: 'authenticated' | 'anonymous' | 'unknown' + referrerContext: 'internal' | 'search-engine' | 'direct' | 'suspicious' + technicalProfile: { + browserFeatures: string[] + networkConsistency: number + headerCredibility: number + } +} + +// Improved scoring that rewards good behavior +export const IMPROVED_BEHAVIOR_WEIGHTS = { + // Positive factors (reduce bot score) + GOOD_NAVIGATION: -10, // Following logical navigation paths + CONTENT_ENGAGEMENT: -15, // Time spent reading content + LEGITIMATE_REFERRER: -5, // Coming from search engines/legitimate sites + PROPER_HEADERS: -8, // Complete, consistent header set + AUTHENTICATED_ACCESS: -20, // Successfully authenticated users + + // Negative factors (increase bot score) + NONEXISTENT_PATH_SCAN: 25, // Scanning for paths that don't exist + CREDENTIAL_STUFFING: 50, // Multiple login attempts + VULNERABILITY_PROBE: 40, // Testing for known vulnerabilities + RAPID_ERROR_GENERATION: 30, // Generating many errors quickly + SUSPICIOUS_USER_AGENT: 20, // User agent inconsistencies + + // Context-dependent factors + ADMIN_ACCESS_UNAUTHENTICATED: 35, // Accessing admin without auth + API_ABUSE: 25, // Excessive API calls without proper usage + SYSTEMATIC_ENUMERATION: 30, // Clear enumeration patterns +} + +// Intent recognition based on request patterns +export function analyzeUserIntent(requests: Array<{ path: string, timestamp: number, status?: number }>): string { + if (requests.length < 3) + return 'unknown' + + const paths = requests.map(r => r.path) + const recentPaths = paths.slice(-10) // Last 10 requests + + // Check for logical navigation patterns + const hasLogicalProgression = checkLogicalProgression(recentPaths) + if (hasLogicalProgression) + return 'browsing' + + // Check for systematic scanning + const isSystematic = checkSystematicPattern(recentPaths) + if (isSystematic) + return 'scanning' + + // Check for exploitation attempts + const isExploiting = 
checkExploitationPattern(recentPaths) + if (isExploiting) + return 'exploiting' + + // Check for curious exploration (legitimate) + const isExploring = checkExplorationPattern(recentPaths) + if (isExploring) + return 'exploring' + + return 'unknown' +} + +function checkLogicalProgression(paths: string[]): boolean { + // Look for human-like navigation: + // - Home -> category -> article + // - Search -> results -> details + // - Navigation menu following + + const navigationPatterns = [ + ['/', '/blog', '/blog/'], // Home to blog + ['/', '/products', '/products/'], // Home to products + ['/search', '/article/', '/'], // Search to content + ] + + return navigationPatterns.some(pattern => + pattern.every((step, i) => i >= paths.length || paths[i].includes(step)), + ) +} + +function checkSystematicPattern(paths: string[]): boolean { + // Detect systematic scanning (high confidence bot behavior): + // - Sequential numeric IDs: /user/1, /user/2, /user/3 + // - Alphabetical enumeration: /admin, /backup, /config + // - Extension testing: /index.php, /index.asp, /index.html + + // Check for numeric sequence scanning + const numericMatches = paths + .map(p => p.match(/\/(\d+)(?:\/|$)/)) + .filter(Boolean) + .map(m => Number.parseInt(m![1])) + + if (numericMatches.length >= 3) { + const sorted = [...numericMatches].sort((a, b) => a - b) + const isSequential = sorted.every((val, i) => i === 0 || val === sorted[i - 1] + 1) + if (isSequential) + return true + } + + // Check for alphabetical scanning + const pathBases = paths.map(p => p.split('/').pop()?.split('?')[0]).filter(Boolean) + if (pathBases.length >= 4) { + const sorted = [...pathBases].sort() + const isAlphabetical = pathBases.join('') === sorted.join('') + if (isAlphabetical) + return true + } + + return false +} + +function checkExploitationPattern(paths: string[]): boolean { + // High-confidence malicious patterns: + // - SQL injection attempts + // - XSS probe attempts + // - Directory traversal + // - Known 
vulnerability scanners + + const exploitPatterns = [ + /['"]\s*(union|select|insert|update|delete)/i, // SQL injection + / + exploitPatterns.some(pattern => pattern.test(path)), + ) +} + +function checkExplorationPattern(paths: string[]): boolean { + // Legitimate user exploration: + // - Checking multiple sections of site + // - Following links and references + // - Reasonable time between requests + // - Mixture of successful and failed requests (normal 404s) + + const uniqueSections = new Set( + paths.map(p => `/${p.split('/')[1]}`).filter(Boolean), + ) + + // Exploring multiple sections is human-like + return uniqueSections.size >= 3 && paths.length >= 5 +} + +// Site profiling to understand what's legitimate for THIS site +export function buildSiteProfile(event: H3Event, existingProfile?: SiteProfile): SiteProfile { + const profile = existingProfile || { + detectedCMS: 'unknown', + hasAdminArea: false, + adminPaths: [], + apiEndpoints: [], + existingPaths: new Set(), + userAgentPatterns: new Map(), + legitimateAccessPatterns: [], + } + + const path = event.path || '' + const status = getResponseStatus(event) + const headers = getHeaders(event) + const userAgent = headers['user-agent'] || '' + + // Track existing paths (200 responses) + if (status >= 200 && status < 300) { + profile.existingPaths.add(path) + + // Detect CMS type based on successful responses + if (path.includes('/wp-') || path.includes('wp-admin')) { + profile.detectedCMS = 'wordpress' + } + else if (path.includes('/_nuxt/')) { + profile.detectedCMS = 'nuxt' + } + else if (path.includes('/_next/')) { + profile.detectedCMS = 'next' + } + + // Detect admin areas + if (path.includes('/admin') || path.includes('/dashboard')) { + profile.hasAdminArea = true + profile.adminPaths.push(path) + } + + // Detect API endpoints + if (path.includes('/api/') || path.includes('.json')) { + profile.apiEndpoints.push(path) + } + } + + // Track legitimate user agent patterns + if (userAgent && status < 400) { + 
const count = profile.userAgentPatterns.get(userAgent) || 0 + profile.userAgentPatterns.set(userAgent, count + 1) + } + + return profile +} + +// Context-aware path scoring +export function scorePathAccess( + path: string, + profile: SiteProfile, + context: ImprovedDetectionContext, +): { score: number, reason: string } { + // If path exists on site and user is exploring legitimately, minimal penalty + if (profile.existingPaths.has(path) && context.userIntent === 'browsing') { + return { score: 0, reason: 'legitimate-access' } + } + + // If authenticated user accessing admin area, no penalty + if (context.authenticationStatus === 'authenticated' + && profile.adminPaths.some(ap => path.startsWith(ap))) { + return { score: 0, reason: 'authenticated-admin-access' } + } + + // High penalty for scanning non-existent paths + if (!profile.existingPaths.has(path) && context.userIntent === 'scanning') { + return { score: 40, reason: 'nonexistent-path-scanning' } + } + + // Very high penalty for exploitation attempts + if (context.userIntent === 'exploiting') { + return { score: 60, reason: 'exploitation-attempt' } + } + + // Moderate penalty for unauthenticated admin access + if (context.authenticationStatus === 'anonymous' + && profile.adminPaths.some(ap => path.startsWith(ap))) { + return { score: 25, reason: 'unauthenticated-admin-attempt' } + } + + // Check against CMS-specific patterns only if that CMS is detected + if (profile.detectedCMS === 'wordpress' && isWordPressVulnerabilityPath(path)) { + return { score: 35, reason: 'wordpress-vulnerability-probe' } + } + + // Default: small penalty for accessing non-existent paths + if (!profile.existingPaths.has(path)) { + return { score: 10, reason: 'nonexistent-path' } + } + + return { score: 0, reason: 'normal-access' } +} + +function isWordPressVulnerabilityPath(path: string): boolean { + const wpVulnPaths = [ + '/wp-config.php', + '/wp-config.php.bak', + '/wp-admin/install.php', + '/.wp-config.php.swp', + ] + return 
wpVulnPaths.some(vp => path.includes(vp)) +} + +// Credibility scoring that builds trust over time +export function updateCredibilityScore( + currentScore: number, + sessionData: any, + context: ImprovedDetectionContext, +): number { + let newScore = currentScore + + // Positive actions that build credibility + if (context.userIntent === 'browsing') + newScore += 2 + if (context.accessPattern === 'human-like') + newScore += 3 + if (context.authenticationStatus === 'authenticated') + newScore += 5 + if (context.referrerContext === 'search-engine') + newScore += 1 + + // Negative actions that reduce credibility + if (context.userIntent === 'scanning') + newScore -= 10 + if (context.userIntent === 'exploiting') + newScore -= 20 + if (context.accessPattern === 'malicious') + newScore -= 15 + + // Cap at 0-100 + return Math.max(0, Math.min(100, newScore)) +} diff --git a/libs/is-bot/src/index.ts b/libs/is-bot/src/index.ts new file mode 100644 index 00000000..4ff46526 --- /dev/null +++ b/libs/is-bot/src/index.ts @@ -0,0 +1,75 @@ +// Main exports for the bot detection library +export type { + BotDetectionConfig, + BotDetectionRequest, + BotDetectionResponse, + BotDetectionStorage, + SessionData, + IPData, + SiteProfile, + DetectionContext, + SessionIdentifier, + ResponseStatusProvider, + BehaviorConfiguration +} from './types' + +export { BotDetectionEngine } from './core' + +// Storage adapters +export { UnstorageAdapter } from './adapters/unstorage' +export { MemoryAdapter } from './adapters/memory' +export { UnstorageBehaviorAdapter, type BehaviorStorage } from './adapters/behavior-storage' + +// H3/Nuxt adapters +export { + h3ToBotDetectionRequest, + H3SessionIdentifier, + H3RealSessionIdentifier, + H3ResponseStatusProvider, + createTrackedBotDetectionRequest +} from './adapters/h3' + +// Behavior system (still available for direct use) +export { + modularBotAnalysis, + setBehaviorConfig, + DEFAULT_BEHAVIOR_CONFIG, + type BotDetectionBehaviorConfig +} from 
'./modular-analyzer' + +// Individual behaviors (for custom implementations) +export * from './behaviors' + +// Utility functions for common bot detection patterns +export function isBotUserAgent(userAgent: string): boolean { + const botPatterns = [ + /bot/i, + /crawler/i, + /spider/i, + /scraper/i, + /curl/i, + /wget/i, + /python-requests/i + ] + + return botPatterns.some(pattern => pattern.test(userAgent)) +} + +export function isSensitivePath(path: string): boolean { + const sensitivePaths = [ + '/wp-login', + '/xmlrpc.php', + '/.env', + '/phpmyadmin', + '/admin', + '/wp-admin' + ] + + return sensitivePaths.some(sensitive => path.includes(sensitive)) +} + +export function isValidUserAgent(userAgent: string): boolean { + return userAgent.length >= 20 && + userAgent.includes('Mozilla') && + (userAgent.includes('Chrome') || userAgent.includes('Firefox') || userAgent.includes('Safari')) +} \ No newline at end of file diff --git a/libs/is-bot/src/modular-analyzer.ts b/libs/is-bot/src/modular-analyzer.ts new file mode 100644 index 00000000..008a44d0 --- /dev/null +++ b/libs/is-bot/src/modular-analyzer.ts @@ -0,0 +1,262 @@ +// Modular bot detection analyzer - framework agnostic +import type { H3Event } from 'h3' +import { getHeaders } from 'h3' +import type { DetectionContext } from './types' +import type { BotDetectionBehaviorConfig } from './behaviors' +import type { BotDetectionBehavior } from './behavior' +import { + analyzeAdvancedIntent, + analyzeAdvancedPositiveSignals, + // Advanced behaviors + analyzeAdvancedTiming, + analyzeBasicPositiveSignals, + analyzeBasicRateLimit, + analyzeBasicTiming, + + analyzeBasicUserAgent, + analyzeBehavioralCredibility, + analyzeBrowserFingerprint, + + // Intermediate behaviors + analyzeBurstPattern, + analyzeContextualRateLimit, + analyzeHeaderConsistency, + // Simple behaviors + analyzePathAccess, + analyzeSimplePatterns, + + // Configuration + DEFAULT_BEHAVIOR_CONFIG, +} from './behaviors' +import { buildBasicSiteProfile } 
// Global site profile and config
// Shape is whatever buildBasicSiteProfile returns; it is fed back into the
// next call, so it stays untyped here.
let globalSiteProfile: any = null
let behaviorConfig = DEFAULT_BEHAVIOR_CONFIG

/**
 * Modular bot analysis - choose which behaviors to enable.
 *
 * Runs every behavior that is enabled in `config`, multiplies each non-zero
 * result by that behavior's weight, and sums the weighted scores. Disabled
 * behaviors are never invoked and zero-score results produce no factor.
 *
 * @param options.event - Incoming H3 event; its path and headers are analysed.
 * @param options.behavior - Per-visitor behavior record; `behavior.session` is handed to the analyzers.
 * @param options.config - Behavior switches and weights. Defaults to the module-level config set via `setBehaviorConfig`.
 * @param options.debug - Accepted for API compatibility; not read.
 * @returns The summed score (rounded to one decimal), a 0-100 confidence
 *   derived from the number of factors and recorded requests, the individual
 *   factors, and the resulting recommendation.
 */
export function modularBotAnalysis({
  event,
  behavior,
  config = behaviorConfig,
  debug: _debug = false,
}: {
  event: H3Event
  behavior: BotDetectionBehavior
  config?: BotDetectionBehaviorConfig
  debug?: boolean
}): {
    botScore: number
    confidence: number
    factors: Array<{ type: string, score: number, reason: string }>
    recommendation: 'allow' | 'challenge' | 'block'
  } {
  const path = event.path || ''
  const headers = getHeaders(event)
  const sessionData = behavior.session

  // Build simple site profile
  globalSiteProfile = buildBasicSiteProfile(event, globalSiteProfile)

  // Build basic context (without complex analysis)
  const context: DetectionContext = {
    userIntent: 'unknown',
    // NOTE(review): 'unknown' is not a member of DetectionContext['accessPattern'];
    // the cast is kept so the runtime value stays exactly as before.
    accessPattern: 'unknown' as DetectionContext['accessPattern'],
    credibilityScore: sessionData.knownGoodActions * 5,
    authenticationStatus: detectSimpleAuthStatus(headers, path),
    // NOTE(review): analyzeSimpleReferrer may yield 'external', which the
    // declared union does not list; cast kept to preserve the runtime value.
    referrerContext: analyzeSimpleReferrer(headers) as DetectionContext['referrerContext'],
    technicalProfile: {
      browserFeatures: [],
      networkConsistency: 0.5,
      headerCredibility: 0.5,
    },
  }

  const factors: Array<{ type: string, score: number, reason: string }> = []
  let totalScore = 0

  // Runs one behavior if it is enabled and records its weighted, non-zero
  // result. `analyze` is a thunk so disabled behaviors are never evaluated.
  const apply = (
    type: string,
    settings: { enabled: boolean, weight: number },
    analyze: () => { score: number, reason: string },
  ): void => {
    if (!settings.enabled)
      return
    const result = analyze()
    if (result.score === 0)
      return
    const adjustedScore = result.score * settings.weight
    factors.push({ type, score: adjustedScore, reason: result.reason })
    totalScore += adjustedScore
  }

  // Apply simple behaviors
  const { simple, intermediate, advanced } = config
  apply('PATH_ANALYSIS', simple.pathAnalysis, () => analyzePathAccess(path, context))
  apply('BASIC_TIMING', simple.basicTiming, () => analyzeBasicTiming(sessionData))
  apply('BASIC_RATE_LIMIT', simple.basicRateLimit, () => analyzeBasicRateLimit(sessionData))
  apply('BASIC_USER_AGENT', simple.basicUserAgent, () => analyzeBasicUserAgent(headers))
  apply('SIMPLE_PATTERNS', simple.simplePatterns, () => analyzeSimplePatterns(sessionData))
  apply('BASIC_POSITIVE_SIGNALS', simple.basicPositiveSignals, () => analyzeBasicPositiveSignals(headers, sessionData))

  // Apply intermediate behaviors if enabled
  apply('BURST_DETECTION', intermediate.burstDetection, () => analyzeBurstPattern(sessionData))
  apply('HEADER_CONSISTENCY', intermediate.headerConsistency, () => analyzeHeaderConsistency(headers))
  apply('CONTEXTUAL_RATE_LIMIT', intermediate.contextualRateLimit, () => analyzeContextualRateLimit(sessionData, context))

  // Apply advanced behaviors if enabled (use with caution)
  apply('ADVANCED_TIMING', advanced.advancedTiming, () => analyzeAdvancedTiming(sessionData))
  apply('ADVANCED_INTENT', advanced.advancedIntent, () => analyzeAdvancedIntent(sessionData))
  apply('BROWSER_FINGERPRINT', advanced.browserFingerprint, () => analyzeBrowserFingerprint(headers, sessionData))
  apply('ADVANCED_POSITIVE_SIGNALS', advanced.advancedPositiveSignals, () => analyzeAdvancedPositiveSignals(headers, sessionData, context))
  apply('BEHAVIORAL_CREDIBILITY', advanced.behavioralCredibility, () => analyzeBehavioralCredibility(sessionData))

  // Calculate confidence based on number of active behaviors
  const activeFactors = factors.length
  const requestCount = sessionData.lastRequests.length
  const confidence = Math.min(100, (activeFactors * 10) + (requestCount * 3))

  // Simple thresholds
  const recommendation = determineSimpleRecommendation(totalScore, confidence)

  return {
    botScore: Math.round(totalScore * 10) / 10,
    confidence,
    factors,
    recommendation,
  }
}

// Simple helper functions

/**
 * Classifies the request as authenticated when it carries an `authorization`
 * or `cookie` header, as unknown when the path looks like a login/auth
 * endpoint, and as anonymous otherwise.
 */
function detectSimpleAuthStatus(
  headers: Record<string, string | undefined>,
  path: string,
): DetectionContext['authenticationStatus'] {
  const authHeaders = ['authorization', 'cookie']
  if (authHeaders.some(header => headers[header]))
    return 'authenticated'
  if (path.includes('/login') || path.includes('/auth'))
    return 'unknown'
  return 'anonymous'
}

/**
 * Classifies the referrer header: absent -> 'direct', contains google.com or
 * bing.com -> 'search-engine', anything else -> 'external'.
 */
function analyzeSimpleReferrer(
  headers: Record<string, string | undefined>,
): DetectionContext['referrerContext'] | 'external' {
  const referrer = headers.referer || headers.referrer || ''
  if (!referrer)
    return 'direct'
  if (referrer.includes('google.com') || referrer.includes('bing.com'))
    return 'search-engine'
  return 'external'
}

/**
 * Maps a total score to an action. The confidence argument is accepted but
 * does not influence the outcome.
 */
function determineSimpleRecommendation(score: number, _confidence: number): 'allow' | 'challenge' | 'block' {
  // Conservative thresholds
  if (score >= 50)
    return 'block'
  if (score >= 30)
    return 'challenge'
  return 'allow'
}

// Configuration setter
/**
 * Merges `config` into the module-level behavior configuration. Each tier
 * (simple / intermediate / advanced) is merged one level deep: a behavior
 * entry supplied in `config` replaces the stored entry for that behavior.
 */
export function setBehaviorConfig(config: Partial<BotDetectionBehaviorConfig>): void {
  behaviorConfig = {
    simple: { ...behaviorConfig.simple, ...config.simple },
    intermediate: { ...behaviorConfig.intermediate, ...config.intermediate },
    advanced: { ...behaviorConfig.advanced, ...config.advanced },
  }
}
// Core types for the bot detection library
// Framework-agnostic types
//
// NOTE(review): the generic type arguments in this file were missing from the
// reviewed text. They are reconstructed below from how the test suites use
// these types; each reconstruction is marked and should be confirmed.

/** Framework-neutral description of one incoming HTTP request. */
export interface BotDetectionRequest {
  path: string
  method: string
  // NOTE(review): reconstructed - tests pass a plain string, a string array and an empty object.
  headers: Record<string, string | string[] | undefined>
  ip: string
  timestamp?: number
}

/** Outcome of analysing a single request. */
export interface BotDetectionResponse {
  isBot: boolean
  confidence: number
  score: number
  factors: Array<{
    type: string
    score: number
    reason: string
  }>
  recommendation: 'allow' | 'challenge' | 'block'
  sessionId: string
}

/** Per-session tracking record. */
export interface SessionData {
  id: string
  lastRequests: Array<{
    timestamp: number
    path: string
    status?: number
    timeSincePrevious?: number
    method?: string
  }>
  suspiciousPathHits: number
  maybeSensitivePathHits: number
  uniqueSensitivePathsAccessed: string[]
  errorCount: number
  score: number
  lastScore: number
  lastUpdated: number
  knownGoodActions: number
  requestMethodVariety: string[]
  averageTimeBetweenRequests?: number
  requestSequenceEntropy: number
  firstSeenAt: number
  behaviorChangePoints?: number[]
}

/** Per-IP tracking record. */
export interface IPData {
  sessionCount: number
  activeSessions: string[]
  suspiciousScore: number
  lastUpdated: number
  legitSessionsCount: number
  sessionsPerHour?: number
  lastSessionCreated?: number
  isBot?: boolean
  isBotConfidence?: number
  details?: { name: string, type: string, trusted?: boolean } | null
  factors: string[]
}

/** Facts learned about the protected site. */
export interface SiteProfile {
  detectedCMS?: 'wordpress' | 'drupal' | 'nuxt' | 'next' | 'unknown'
  hasAdminArea: boolean
  adminPaths: string[]
  apiEndpoints: string[]
  // NOTE(review): element type reconstructed; presumably request paths - confirm.
  existingPaths: Set<string>
  // NOTE(review): key/value types could not be recovered; left permissive so no caller breaks - confirm and tighten.
  userAgentPatterns: Map<string, any>
  legitimateAccessPatterns: string[]
}

/** Derived view of a visitor that the behavior analyzers receive. */
export interface DetectionContext {
  userIntent: 'browsing' | 'exploring' | 'scanning' | 'exploiting' | 'unknown'
  accessPattern: 'human-like' | 'systematic' | 'random' | 'malicious'
  credibilityScore: number
  authenticationStatus: 'authenticated' | 'anonymous' | 'unknown'
  referrerContext: 'internal' | 'search-engine' | 'direct' | 'suspicious'
  technicalProfile: {
    browserFeatures: string[]
    networkConsistency: number
    headerCredibility: number
  }
}

// Storage interface - framework agnostic
// NOTE(review): Promise payloads reconstructed. The adapter tests expect
// `null` for a missing or expired session; the same is assumed for IP data and
// the site profile - confirm.
export interface BotDetectionStorage {
  getSession(sessionId: string): Promise<SessionData | null>
  setSession(sessionId: string, data: SessionData): Promise<void>
  getIP(ip: string): Promise<IPData | null>
  setIP(ip: string, data: IPData): Promise<void>
  getSiteProfile(): Promise<SiteProfile | null>
  setSiteProfile(profile: SiteProfile): Promise<void>
  cleanup?(): Promise<void>
}

// Configuration interface
export interface BotDetectionConfig {
  session?: {
    ttl?: number
    maxSessionsPerIP?: number
  }
  thresholds?: {
    definitelyBot?: number
    likelyBot?: number
    suspicious?: number
  }
  customSensitivePaths?: string[]
  ipFilter?: {
    trustedIPs?: string[]
    blockedIPs?: string[]
  }
  debug?: boolean
  behaviors?: BehaviorConfiguration
}

// NOTE(review): value type could not be recovered. The integration suite
// passes `{ enabled: boolean, weight: number }` entries; left permissive so
// existing callers keep compiling - confirm and tighten.
export interface BehaviorConfiguration {
  simple?: Record<string, any>
  intermediate?: Record<string, any>
  advanced?: Record<string, any>
}

// Session identifier interface - allows for different session strategies
export interface SessionIdentifier {
  // NOTE(review): Promise payload reconstructed from the union's synchronous `string` branch.
  getSessionId(request: BotDetectionRequest): Promise<string> | string
}

// Response status interface - allows framework to provide response status
export interface ResponseStatusProvider {
  getStatus(request: BotDetectionRequest): number | undefined
}
/**
 * Builds a pristine session record for the adapter tests.
 * Fields listed in `overrides` replace the defaults.
 */
function createSessionData(overrides: Partial<SessionData> = {}): SessionData {
  return {
    lastRequests: [],
    suspiciousPathHits: 0,
    maybeSensitivePathHits: 0,
    uniqueSensitivePathsAccessed: [],
    errorCount: 0,
    score: 0,
    lastScore: 0,
    lastUpdated: Date.now(),
    knownGoodActions: 0,
    requestMethodVariety: [],
    requestSequenceEntropy: 0,
    firstSeenAt: Date.now(),
    behaviorChangePoints: [],
    trafficType: TrafficType.UNKNOWN,
    ...overrides
  }
}

/**
 * Builds a GET request for `/` from 192.168.1.1 with a browser-like user
 * agent. Fields listed in `overrides` replace the defaults.
 */
function createRequest(overrides: Partial<BotDetectionRequest> = {}): BotDetectionRequest {
  return {
    path: '/',
    method: 'GET',
    headers: {
      'user-agent': 'Mozilla/5.0 test browser'
    },
    ip: '192.168.1.1',
    timestamp: Date.now(),
    ...overrides
  }
}

describe('Storage Adapters', () => {
  describe('MemoryAdapter', () => {
    let adapter: MemoryAdapter

    beforeEach(() => {
      adapter = new MemoryAdapter({ ttl: 1000 }) // 1 second TTL for testing
    })

    it('should store and retrieve session data', async () => {
      const sessionData = createSessionData()

      await adapter.setSession('test-session', sessionData)
      const retrieved = await adapter.getSession('test-session')

      expect(retrieved).toEqual(sessionData)
    })

    it('should store and retrieve IP data', async () => {
      const ipData: IPData = {
        sessionCount: 1,
        activeSessions: ['test-session'],
        suspiciousScore: 0,
        lastUpdated: Date.now(),
        legitSessionsCount: 1,
        // NOTE(review): was spelled `factores`; presumably a typo for the
        // `factors` field - confirm against the IPData declaration.
        factors: [],
        isBot: false,
        isBotConfidence: 0,
        lastSessionCreated: Date.now()
      }

      await adapter.setIP('192.168.1.1', ipData)
      const retrieved = await adapter.getIP('192.168.1.1')

      expect(retrieved).toEqual(ipData)
    })

    it('should handle TTL expiration', async () => {
      // Last touched 2 seconds ago, i.e. older than the 1 second TTL
      const staleAt = Date.now() - 2000
      const sessionData = createSessionData({ lastUpdated: staleAt, firstSeenAt: staleAt })

      await adapter.setSession('expired-session', sessionData)
      const retrieved = await adapter.getSession('expired-session')

      expect(retrieved).toBeNull()
    })

    it('should cleanup expired data', async () => {
      const staleAt = Date.now() - 2000
      const oldSessionData = createSessionData({ lastUpdated: staleAt, firstSeenAt: staleAt })
      const newSessionData: SessionData = {
        ...oldSessionData,
        lastUpdated: Date.now()
      }

      await adapter.setSession('old-session', oldSessionData)
      await adapter.setSession('new-session', newSessionData)

      await adapter.cleanup()

      expect(await adapter.getSession('old-session')).toBeNull()
      expect(await adapter.getSession('new-session')).toBeTruthy()
    })

    it('should provide stats', () => {
      const stats = adapter.getStats()

      expect(stats).toHaveProperty('sessions')
      expect(stats).toHaveProperty('ips')
      expect(stats).toHaveProperty('hasSiteProfile')
      expect(typeof stats.sessions).toBe('number')
      expect(typeof stats.ips).toBe('number')
      expect(typeof stats.hasSiteProfile).toBe('boolean')
    })

    it('should clear all data', async () => {
      await adapter.setSession('test-session', createSessionData())
      adapter.clear()

      expect(await adapter.getSession('test-session')).toBeNull()
      expect(adapter.getStats().sessions).toBe(0)
    })
  })

  describe('H3SessionIdentifier', () => {
    let identifier: H3SessionIdentifier

    beforeEach(() => {
      identifier = new H3SessionIdentifier('test-secret')
    })

    it('should generate consistent session IDs for same input', async () => {
      const request = createRequest()

      const sessionId1 = await identifier.getSessionId(request)
      const sessionId2 = await identifier.getSessionId(request)

      expect(sessionId1).toBe(sessionId2)
      expect(typeof sessionId1).toBe('string')
      expect(sessionId1.length).toBeGreaterThan(0)
    })

    it('should generate different session IDs for different IPs', async () => {
      const request1 = createRequest()
      const request2: BotDetectionRequest = {
        ...request1,
        ip: '192.168.1.2'
      }

      const sessionId1 = await identifier.getSessionId(request1)
      const sessionId2 = await identifier.getSessionId(request2)

      expect(sessionId1).not.toBe(sessionId2)
    })

    it('should generate different session IDs for different user agents', async () => {
      const request1 = createRequest({ headers: { 'user-agent': 'Mozilla/5.0 Firefox' } })
      const request2: BotDetectionRequest = {
        ...request1,
        headers: {
          'user-agent': 'Mozilla/5.0 Chrome'
        }
      }

      const sessionId1 = await identifier.getSessionId(request1)
      const sessionId2 = await identifier.getSessionId(request2)

      expect(sessionId1).not.toBe(sessionId2)
    })

    it('should handle missing user agent', async () => {
      const sessionId = await identifier.getSessionId(createRequest({ headers: {} }))

      expect(typeof sessionId).toBe('string')
      expect(sessionId.length).toBeGreaterThan(0)
    })

    it('should handle array user agent headers', async () => {
      const request = createRequest({
        headers: {
          'user-agent': ['Mozilla/5.0 first', 'Mozilla/5.0 second']
        }
      })

      const sessionId = await identifier.getSessionId(request)

      expect(typeof sessionId).toBe('string')
      expect(sessionId.length).toBeGreaterThan(0)
    })
  })
})
describe('Behavior Analysis Functions', () => {
  let mockSessionData: SessionData

  // Appends one GET entry to the shared mock session.
  const recordRequest = (timestamp: number, path: string, timeSincePrevious: number): void => {
    mockSessionData.lastRequests.push({ timestamp, path, timeSincePrevious, method: 'GET' })
  }

  // Every behavior entry of the default config, across all three tiers.
  const listDefaultBehaviors = () => [
    ...Object.values(DEFAULT_BEHAVIOR_CONFIG.simple),
    ...Object.values(DEFAULT_BEHAVIOR_CONFIG.intermediate),
    ...Object.values(DEFAULT_BEHAVIOR_CONFIG.advanced)
  ]

  // Each test starts from an empty session.
  beforeEach(() => {
    mockSessionData = {
      lastRequests: [],
      suspiciousPathHits: 0,
      maybeSensitivePathHits: 0,
      uniqueSensitivePathsAccessed: [],
      errorCount: 0,
      score: 0,
      lastScore: 0,
      lastUpdated: Date.now(),
      knownGoodActions: 0,
      requestMethodVariety: [],
      requestSequenceEntropy: 0,
      firstSeenAt: Date.now(),
      behaviorChangePoints: [],
      trafficType: TrafficType.UNKNOWN
    }
  })

  describe('analyzeBasicUserAgent', () => {
    it('should detect bot user agents', () => {
      const { score, reason } = analyzeBasicUserAgent({ 'user-agent': 'curl/7.68.0' })

      expect(score).toBeGreaterThan(0)
      expect(reason).toContain('suspicious-user-agent-length')
    })

    it('should accept legitimate user agents', () => {
      const { score } = analyzeBasicUserAgent({
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
      })

      expect(score).toBe(0)
    })

    it('should penalize missing user agent', () => {
      const { score, reason } = analyzeBasicUserAgent({})

      expect(score).toBeGreaterThan(0)
      expect(reason).toContain('missing')
    })

    it('should penalize short user agent', () => {
      const { score, reason } = analyzeBasicUserAgent({ 'user-agent': 'Bot' })

      expect(score).toBeGreaterThan(0)
      expect(reason).toContain('suspicious-user-agent-length')
    })
  })

  describe('analyzeBasicTiming', () => {
    it('should detect rapid requests', () => {
      const now = Date.now()
      // Five requests, exactly 100ms apart
      const ages = [500, 400, 300, 200, 100]
      ages.forEach((age, index) => {
        recordRequest(now - age, `/page${index + 1}`, index === 0 ? 0 : 100)
      })

      const { score, reason } = analyzeBasicTiming(mockSessionData)

      expect(score).toBeGreaterThan(0)
      expect(reason).toContain('robotic-timing-detected')
    })

    it('should accept normal timing patterns', () => {
      const now = Date.now()
      recordRequest(now - 10000, '/page1', 0)
      recordRequest(now - 5000, '/page2', 5000)
      recordRequest(now - 2000, '/page3', 3000)

      const { score } = analyzeBasicTiming(mockSessionData)

      expect(score).toBe(0)
    })

    it('should handle single request', () => {
      recordRequest(Date.now(), '/page1', 0)

      const { score } = analyzeBasicTiming(mockSessionData)

      expect(score).toBe(0)
    })
  })

  describe('analyzeBasicRateLimit', () => {
    it('should detect high request volume', () => {
      // Add many requests
      let index = 0
      while (index < 50) {
        recordRequest(Date.now() - (index * 1000), `/page${index}`, 1000)
        index += 1
      }

      const { score, reason } = analyzeBasicRateLimit(mockSessionData)

      expect(score).toBeGreaterThan(0)
      expect(reason).toContain('excessive-requests')
    })

    it('should accept normal request volume', () => {
      let index = 0
      while (index < 5) {
        recordRequest(Date.now() - (index * 5000), `/page${index}`, 5000)
        index += 1
      }

      const { score } = analyzeBasicRateLimit(mockSessionData)

      expect(score).toBe(0)
    })
  })

  describe('analyzePathAccess', () => {
    it('should detect sensitive path access', () => {
      const anonymousSystematicContext = {
        userIntent: 'unknown' as const,
        accessPattern: 'systematic' as const,
        credibilityScore: 0,
        authenticationStatus: 'anonymous' as const,
        referrerContext: 'direct' as const,
        technicalProfile: {
          browserFeatures: [],
          networkConsistency: 0.5,
          headerCredibility: 0.5
        }
      }

      const { score, reason } = analyzePathAccess('/wp-login.php', anonymousSystematicContext)

      expect(score).toBeGreaterThan(0)
      expect(reason).toContain('high-risk-path')
    })

    it('should accept normal path access', () => {
      const browsingContext = {
        userIntent: 'browsing' as const,
        accessPattern: 'human-like' as const,
        credibilityScore: 50,
        authenticationStatus: 'anonymous' as const,
        referrerContext: 'search-engine' as const,
        technicalProfile: {
          browserFeatures: [],
          networkConsistency: 0.8,
          headerCredibility: 0.8
        }
      }

      const { score } = analyzePathAccess('/about', browsingContext)

      expect(score).toBe(0)
    })
  })

  describe('analyzeHeaderConsistency', () => {
    it('should detect inconsistent headers', () => {
      const { score } = analyzeHeaderConsistency({
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'accept-language': 'zh-CN,zh;q=0.9', // Chinese language
        'accept-encoding': 'compress' // Old encoding
      })

      expect(score).toBeGreaterThan(0)
    })

    it('should accept consistent headers', () => {
      const { score } = analyzeHeaderConsistency({
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'accept-language': 'en-US,en;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
      })

      expect(score).toBe(0)
    })
  })

  describe('analyzeBurstPattern', () => {
    it('should detect burst patterns', () => {
      const now = Date.now()
      // Ten requests, 200ms apart
      Array.from({ length: 10 }, (_, index) => index).forEach((index) => {
        recordRequest(now - (200 * index), `/api/data${index}`, index === 0 ? 0 : 200)
      })

      const { score, reason } = analyzeBurstPattern(mockSessionData)

      expect(score).toBeGreaterThan(0)
      expect(reason).toContain('burst')
    })

    it('should accept normal request patterns', () => {
      const now = Date.now()
      // Five requests, 5 seconds apart
      Array.from({ length: 5 }, (_, index) => index).forEach((index) => {
        recordRequest(now - (5000 * index), `/page${index}`, index === 0 ? 0 : 5000)
      })

      const { score } = analyzeBurstPattern(mockSessionData)

      expect(score).toBe(0)
    })
  })

  describe('DEFAULT_BEHAVIOR_CONFIG', () => {
    it('should have valid configuration structure', () => {
      for (const tier of ['simple', 'intermediate', 'advanced'])
        expect(DEFAULT_BEHAVIOR_CONFIG).toHaveProperty(tier)

      // Check that all behaviors have enabled and weight properties
      for (const behavior of listDefaultBehaviors()) {
        expect(behavior).toHaveProperty('enabled')
        expect(behavior).toHaveProperty('weight')
        expect(typeof behavior.enabled).toBe('boolean')
        expect(typeof behavior.weight).toBe('number')
      }
    })

    it('should have reasonable weight values', () => {
      for (const behavior of listDefaultBehaviors()) {
        expect(behavior.weight).toBeGreaterThan(0)
        expect(behavior.weight).toBeLessThanOrEqual(2) // Reasonable upper bound
      }
    })
  })
})
// User agents shared by several scenarios below.
const FULL_BROWSER_UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
const PARTIAL_BROWSER_UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
const CURL_UA = 'curl/7.68.0'

/**
 * Builds a GET request with the given user agent. Pass `undefined` to send no
 * headers at all. Fields listed in `overrides` replace the defaults.
 */
function createRequest(
  userAgent: string | undefined,
  overrides: Partial<BotDetectionRequest> = {}
): BotDetectionRequest {
  return {
    path: '/',
    method: 'GET',
    headers: userAgent === undefined ? {} : { 'user-agent': userAgent },
    ip: '192.168.1.1',
    timestamp: Date.now(),
    ...overrides
  }
}

describe('BotDetectionEngine', () => {
  let engine: BotDetectionEngine
  let storage: MemoryAdapter
  let sessionIdentifier: H3SessionIdentifier

  beforeEach(() => {
    storage = new MemoryAdapter()
    sessionIdentifier = new H3SessionIdentifier('test-secret')
    engine = new BotDetectionEngine({
      storage,
      sessionIdentifier,
      config: {
        thresholds: {
          likelyBot: 70,
          definitelyBot: 90,
          suspicious: 40
        },
        ipFilter: {
          trustedIPs: ['127.0.0.1'],
          blockedIPs: ['192.168.1.100']
        }
      }
    })
  })

  describe('basic detection', () => {
    it('should detect legitimate user', async () => {
      const result = await engine.analyze(createRequest(FULL_BROWSER_UA))

      expect(result.isBot).toBe(false)
      expect(result.score).toBeLessThan(40)
      expect(result.recommendation).toBe('allow')
    })

    it('should detect obvious bot user agent', async () => {
      const result = await engine.analyze(createRequest(CURL_UA))

      expect(result.isBot).toBe(true)
      expect(result.score).toBeGreaterThan(50)
      expect(result.factors.some(f => f.type.includes('USER_AGENT'))).toBe(true)
    })

    it('should handle trusted IPs', async () => {
      // 127.0.0.1 is on the trusted list configured in beforeEach
      const result = await engine.analyze(createRequest(CURL_UA, { ip: '127.0.0.1' }))

      expect(result.isBot).toBe(false)
      expect(result.score).toBe(0)
      expect(result.recommendation).toBe('allow')
      expect(result.factors.some(f => f.reason === 'ip-trusted')).toBe(true)
    })

    it('should handle blocked IPs', async () => {
      // 192.168.1.100 is on the blocked list configured in beforeEach
      const result = await engine.analyze(
        createRequest('Mozilla/5.0 (legitimate browser)', { ip: '192.168.1.100' })
      )

      expect(result.isBot).toBe(true)
      expect(result.score).toBe(100)
      expect(result.recommendation).toBe('block')
      expect(result.factors.some(f => f.reason === 'ip-blocked')).toBe(true)
    })
  })

  describe('behavioral analysis', () => {
    it('should detect rapid requests', async () => {
      const baseRequest = createRequest(PARTIAL_BROWSER_UA, { path: '/api/data', ip: '192.168.1.50' })

      // Make 15 rapid requests at 100ms intervals. The first call seeds
      // `lastResult` so it is always defined and typed.
      let lastResult = await engine.analyze({ ...baseRequest, timestamp: Date.now() })
      for (let i = 1; i < 15; i++) {
        lastResult = await engine.analyze({
          ...baseRequest,
          timestamp: Date.now() + (i * 100)
        })
      }

      expect(lastResult.score).toBeGreaterThan(30)
      expect(lastResult.factors.some(f => f.type.includes('TIMING') || f.type.includes('RATE'))).toBe(true)
    })

    it('should detect sensitive path access', async () => {
      const result = await engine.analyze(
        createRequest(PARTIAL_BROWSER_UA, { path: '/admin/login', ip: '192.168.1.2' })
      )

      expect(result.score).toBeGreaterThan(10)
    })

    it('should track session behavior over time', async () => {
      // First request should be low score
      const request1 = createRequest(PARTIAL_BROWSER_UA, { ip: '192.168.1.3' })

      const result1 = await engine.analyze(request1)
      expect(result1.score).toBeLessThan(20)

      // Subsequent rapid requests should increase score
      for (let i = 0; i < 20; i++) {
        await engine.analyze({
          ...request1,
          path: `/page-${i}`,
          timestamp: Date.now() + (i * 50) // Very rapid
        })
      }

      // Final request should have higher score
      const finalResult = await engine.analyze({
        ...request1,
        path: '/final',
        timestamp: Date.now() + 1000
      })

      expect(finalResult.score).toBeGreaterThan(result1.score)
    })
  })

  describe('configuration', () => {
    it('should respect custom thresholds', async () => {
      engine.updateConfig({
        thresholds: {
          likelyBot: 30, // Lower threshold
          definitelyBot: 50,
          suspicious: 10
        }
      })

      const result = await engine.analyze(createRequest(CURL_UA, { path: '/admin', ip: '192.168.1.4' }))

      // Should be flagged as bot with lower threshold
      expect(result.isBot).toBe(true)
    })

    it('should handle custom sensitive paths', async () => {
      engine.updateConfig({
        customSensitivePaths: ['/api/secret', '/private/*']
      })

      const result = await engine.analyze(
        createRequest('Mozilla/5.0 (legitimate browser)', { path: '/api/secret', ip: '192.168.1.5' })
      )

      expect(result.score).toBeGreaterThan(0)
    })
  })

  describe('storage integration', () => {
    it('should persist session data', async () => {
      const request = createRequest('Mozilla/5.0 (test browser)', { path: '/test', ip: '192.168.1.6' })

      // First request
      await engine.analyze(request)

      // Check that session data was stored
      const sessionId = await sessionIdentifier.getSessionId(request)
      const sessionData = await storage.getSession(sessionId)

      expect(sessionData).toBeTruthy()
      expect(sessionData!.lastRequests).toHaveLength(1)
      expect(sessionData!.lastRequests[0].path).toBe('/test')
    })

    it('should persist IP data', async () => {
      const request = createRequest('Mozilla/5.0 (test browser)', { path: '/test', ip: '192.168.1.7' })

      await engine.analyze(request)

      const ipData = await storage.getIP(request.ip)

      expect(ipData).toBeTruthy()
      expect(ipData!.sessionCount).toBe(1)
      expect(ipData!.activeSessions).toHaveLength(1)
    })
  })

  describe('error handling', () => {
    it('should handle missing user agent gracefully', async () => {
      const result = await engine.analyze(createRequest(undefined, { ip: '192.168.1.8' }))

      expect(result).toBeTruthy()
      expect(result.score).toBeGreaterThan(0) // Missing user agent should increase score
    })

    it('should handle invalid IP addresses', async () => {
      const result = await engine.analyze(createRequest('Mozilla/5.0', { ip: 'invalid-ip' }))

      expect(result).toBeTruthy()
      expect(typeof result.score).toBe('number')
    })
  })
})
data', async () => { + const request: BotDetectionRequest = { + path: '/test', + method: 'GET', + headers: { + 'user-agent': 'Mozilla/5.0 (test browser)' + }, + ip: '192.168.1.7', + timestamp: Date.now() + } + + await engine.analyze(request) + + const ipData = await storage.getIP(request.ip) + + expect(ipData).toBeTruthy() + expect(ipData!.sessionCount).toBe(1) + expect(ipData!.activeSessions).toHaveLength(1) + }) + }) + + describe('error handling', () => { + it('should handle missing user agent gracefully', async () => { + const request: BotDetectionRequest = { + path: '/', + method: 'GET', + headers: {}, + ip: '192.168.1.8', + timestamp: Date.now() + } + + const result = await engine.analyze(request) + + expect(result).toBeTruthy() + expect(result.score).toBeGreaterThan(0) // Missing user agent should increase score + }) + + it('should handle invalid IP addresses', async () => { + const request: BotDetectionRequest = { + path: '/', + method: 'GET', + headers: { + 'user-agent': 'Mozilla/5.0' + }, + ip: 'invalid-ip', + timestamp: Date.now() + } + + const result = await engine.analyze(request) + + expect(result).toBeTruthy() + expect(typeof result.score).toBe('number') + }) + }) +}) \ No newline at end of file diff --git a/libs/is-bot/test/integration.test.ts b/libs/is-bot/test/integration.test.ts new file mode 100644 index 00000000..3394017f --- /dev/null +++ b/libs/is-bot/test/integration.test.ts @@ -0,0 +1,357 @@ +import { describe, it, expect, beforeEach } from 'vitest' +import { BotDetectionEngine } from '../src/core' +import { MemoryAdapter } from '../src/adapters/memory' +import { H3SessionIdentifier } from '../src/adapters/h3' +import type { BotDetectionRequest } from '../src/types' + +describe('Integration Tests', () => { + let engine: BotDetectionEngine + let storage: MemoryAdapter + let sessionIdentifier: H3SessionIdentifier + + beforeEach(() => { + storage = new MemoryAdapter() + sessionIdentifier = new H3SessionIdentifier('test-secret') + engine = new 
BotDetectionEngine({ + storage, + sessionIdentifier, + config: { + thresholds: { + likelyBot: 70, + definitelyBot: 90, + suspicious: 40 + }, + behaviors: { + simple: { + pathAnalysis: { enabled: true, weight: 1.0 }, + basicTiming: { enabled: true, weight: 1.0 }, + basicRateLimit: { enabled: true, weight: 1.0 }, + basicUserAgent: { enabled: true, weight: 1.0 }, + simplePatterns: { enabled: true, weight: 1.0 }, + basicPositiveSignals: { enabled: true, weight: 1.0 } + }, + intermediate: { + burstDetection: { enabled: true, weight: 1.0 }, + headerConsistency: { enabled: true, weight: 1.0 }, + contextualRateLimit: { enabled: true, weight: 1.0 } + }, + advanced: { + advancedTiming: { enabled: false, weight: 1.0 }, + advancedIntent: { enabled: false, weight: 1.0 }, + browserFingerprint: { enabled: false, weight: 1.0 }, + advancedPositiveSignals: { enabled: false, weight: 1.0 }, + behavioralCredibility: { enabled: false, weight: 1.0 } + } + } + } + }) + }) + + describe('Realistic Bot Scenarios', () => { + it('should detect web scraper bot', async () => { + const baseRequest: BotDetectionRequest = { + path: '/', + method: 'GET', + headers: { + 'user-agent': 'Python/3.9 requests/2.25.1' + }, + ip: '203.0.113.1', + timestamp: Date.now() + } + + // Scraper typically accesses many pages rapidly + const pages = [ + '/products', + '/products/page/2', + '/products/page/3', + '/api/products', + '/search?q=laptop', + '/search?q=phone', + '/sitemap.xml' + ] + + let lastResult + for (let i = 0; i < pages.length; i++) { + const request = { + ...baseRequest, + path: pages[i], + timestamp: Date.now() + (i * 500) // 500ms between requests + } + lastResult = await engine.analyze(request) + } + + expect(lastResult!.isBot).toBe(true) + expect(lastResult!.score).toBeGreaterThan(70) + expect(lastResult!.factors.some(f => f.type.includes('USER_AGENT'))).toBe(true) + }) + + it('should detect aggressive crawler', async () => { + const baseRequest: BotDetectionRequest = { + path: '/', + method: 
'GET', + headers: { + 'user-agent': 'curl/7.68.0' + }, + ip: '198.51.100.1', + timestamp: Date.now() + } + + // Aggressive crawler hits many endpoints very quickly + const endpoints = [ + '/', + '/about', + '/contact', + '/admin', + '/wp-admin', + '/api/users', + '/api/posts', + '/.env', + '/config.php', + '/xmlrpc.php' + ] + + let lastResult + for (let i = 0; i < endpoints.length; i++) { + const request = { + ...baseRequest, + path: endpoints[i], + timestamp: Date.now() + (i * 100) // Very rapid - 100ms between requests + } + lastResult = await engine.analyze(request) + } + + expect(lastResult!.isBot).toBe(true) + expect(lastResult!.score).toBeGreaterThan(80) + expect(lastResult!.recommendation).toBe('block') + }) + + it('should allow legitimate user browsing', async () => { + const baseRequest: BotDetectionRequest = { + path: '/', + method: 'GET', + headers: { + 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'accept-language': 'en-US,en;q=0.5', + 'accept-encoding': 'gzip, deflate, br', + 'referer': 'https://google.com/' + }, + ip: '192.0.2.1', + timestamp: Date.now() + } + + // Normal user browsing pattern + const userJourney = [ + { path: '/', delay: 0 }, + { path: '/products', delay: 3000 }, + { path: '/products/laptop', delay: 5000 }, + { path: '/products/laptop?color=black', delay: 2000 }, + { path: '/cart', delay: 8000 }, + { path: '/checkout', delay: 4000 } + ] + + let lastResult + let currentTime = Date.now() + + for (const step of userJourney) { + currentTime += step.delay + const request = { + ...baseRequest, + path: step.path, + timestamp: currentTime + } + lastResult = await engine.analyze(request) + } + + expect(lastResult!.isBot).toBe(false) + expect(lastResult!.score).toBeLessThan(40) + expect(lastResult!.recommendation).toBe('allow') + }) + + it('should handle mixed 
legitimate and suspicious behavior', async () => { + const baseRequest: BotDetectionRequest = { + path: '/', + method: 'GET', + headers: { + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' + }, + ip: '198.51.100.50', + timestamp: Date.now() + } + + // Start with normal browsing + const normalRequests = [ + { path: '/', delay: 0 }, + { path: '/about', delay: 3000 }, + { path: '/contact', delay: 2000 } + ] + + let currentTime = Date.now() + for (const step of normalRequests) { + currentTime += step.delay + await engine.analyze({ + ...baseRequest, + path: step.path, + timestamp: currentTime + }) + } + + // Then suddenly start rapid suspicious requests + const suspiciousRequests = [ + '/admin', + '/wp-admin', + '/api/users', + '/.env', + '/config.php' + ] + + let lastResult + for (let i = 0; i < suspiciousRequests.length; i++) { + currentTime += 200 // Very rapid + const request = { + ...baseRequest, + path: suspiciousRequests[i], + timestamp: currentTime + } + lastResult = await engine.analyze(request) + } + + // Should detect the suspicious pattern + expect(lastResult!.score).toBeGreaterThan(30) + expect(lastResult!.factors.some(f => + f.type.includes('PATH') || + f.type.includes('TIMING') || + f.type.includes('RATE') + )).toBe(true) + }) + }) + + describe('Performance and Scalability', () => { + it('should handle high volume of requests efficiently', async () => { + const startTime = Date.now() + const numRequests = 100 + + for (let i = 0; i < numRequests; i++) { + const request: BotDetectionRequest = { + path: `/page-${i}`, + method: 'GET', + headers: { + 'user-agent': 'Mozilla/5.0 (test browser)' + }, + ip: `192.168.1.${i % 50}`, // Rotate through 50 IPs + timestamp: Date.now() + (i * 100) + } + + await engine.analyze(request) + } + + const endTime = Date.now() + const avgTimePerRequest = (endTime - startTime) / numRequests + + // Should process requests quickly (less than 10ms per request on average) + 
expect(avgTimePerRequest).toBeLessThan(10) + }) + + it('should maintain session state across multiple requests', async () => { + const ip = '192.168.1.100' + const userAgent = 'Mozilla/5.0 (consistent browser)' + + // Make several requests from the same session + for (let i = 0; i < 5; i++) { + const request: BotDetectionRequest = { + path: `/step-${i}`, + method: 'GET', + headers: { + 'user-agent': userAgent + }, + ip, + timestamp: Date.now() + (i * 1000) + } + + await engine.analyze(request) + } + + // Check that session data accumulated + const sessionId = await sessionIdentifier.getSessionId({ + path: '/test', + method: 'GET', + headers: { 'user-agent': userAgent }, + ip, + timestamp: Date.now() + }) + + const sessionData = await storage.getSession(sessionId) + expect(sessionData).toBeTruthy() + expect(sessionData!.lastRequests.length).toBe(5) + + // Check that IP data accumulated + const ipData = await storage.getIP(ip) + expect(ipData).toBeTruthy() + expect(ipData!.sessionCount).toBe(1) + expect(ipData!.activeSessions).toHaveLength(1) + }) + }) + + describe('Configuration Flexibility', () => { + it('should adapt to different threshold configurations', async () => { + // Create engine with very strict thresholds + const strictEngine = new BotDetectionEngine({ + storage: new MemoryAdapter(), + sessionIdentifier: new H3SessionIdentifier('strict-test'), + config: { + thresholds: { + likelyBot: 20, // Very low threshold + definitelyBot: 40, + suspicious: 10 + } + } + }) + + const mildlyBotRequest: BotDetectionRequest = { + path: '/admin', + method: 'GET', + headers: { + 'user-agent': 'Mozilla/5.0 (short)' + }, + ip: '192.168.1.200', + timestamp: Date.now() + } + + const result = await strictEngine.analyze(mildlyBotRequest) + + // Should be more likely to flag as bot with strict thresholds + expect(result.isBot).toBe(true) + }) + + it('should handle behavior configuration changes', async () => { + // Disable most behaviors + engine.updateBehaviorConfig({ + simple: 
{ + pathAnalysis: { enabled: false, weight: 1.0 }, + basicTiming: { enabled: false, weight: 1.0 }, + basicRateLimit: { enabled: false, weight: 1.0 }, + basicUserAgent: { enabled: true, weight: 1.0 }, // Keep only user agent + simplePatterns: { enabled: false, weight: 1.0 }, + basicPositiveSignals: { enabled: false, weight: 1.0 } + } + }) + + const request: BotDetectionRequest = { + path: '/admin', // Would normally trigger path analysis + method: 'GET', + headers: { + 'user-agent': 'curl/7.68.0' // Should still trigger user agent detection + }, + ip: '192.168.1.201', + timestamp: Date.now() + } + + const result = await engine.analyze(request) + + // Should still detect bot based on user agent only + expect(result.isBot).toBe(true) + expect(result.factors.some(f => f.type.includes('USER_AGENT'))).toBe(true) + expect(result.factors.some(f => f.type.includes('PATH'))).toBe(false) + }) + }) +}) \ No newline at end of file diff --git a/libs/is-bot/test/setup.test.ts b/libs/is-bot/test/setup.test.ts new file mode 100644 index 00000000..08cd2f1b --- /dev/null +++ b/libs/is-bot/test/setup.test.ts @@ -0,0 +1,40 @@ +import { describe, it, expect } from 'vitest' + +describe('Library Setup', () => { + it('should be able to import core components', async () => { + const { BotDetectionEngine } = await import('../src/core') + const { MemoryAdapter } = await import('../src/adapters/memory') + const { H3SessionIdentifier } = await import('../src/adapters/h3') + + expect(BotDetectionEngine).toBeDefined() + expect(MemoryAdapter).toBeDefined() + expect(H3SessionIdentifier).toBeDefined() + }) + + it('should be able to import from index', async () => { + const module = await import('../src/index') + + expect(module.BotDetectionEngine).toBeDefined() + expect(module.MemoryAdapter).toBeDefined() + expect(module.UnstorageAdapter).toBeDefined() + expect(module.UnstorageBehaviorAdapter).toBeDefined() + expect(module.H3SessionIdentifier).toBeDefined() + 
expect(module.isBotUserAgent).toBeDefined() + expect(module.isSensitivePath).toBeDefined() + expect(module.isValidUserAgent).toBeDefined() + }) + + it('should export utility functions', async () => { + const { isBotUserAgent, isSensitivePath, isValidUserAgent } = await import('../src/index') + + // Test utility functions + expect(isBotUserAgent('curl/7.68.0')).toBe(true) + expect(isBotUserAgent('Mozilla/5.0 (legitimate browser)')).toBe(false) + + expect(isSensitivePath('/wp-admin')).toBe(true) + expect(isSensitivePath('/about')).toBe(false) + + expect(isValidUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')).toBe(true) + expect(isValidUserAgent('curl')).toBe(false) + }) +}) \ No newline at end of file diff --git a/libs/is-bot/tsconfig.json b/libs/is-bot/tsconfig.json new file mode 100644 index 00000000..297bc717 --- /dev/null +++ b/libs/is-bot/tsconfig.json @@ -0,0 +1,33 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "ESNext", + "moduleResolution": "bundler", + "allowImportingTsExtensions": true, + "resolveJsonModule": true, + "isolatedModules": true, + "noEmit": true, + "jsx": "preserve", + "strict": true, + "noUnusedLocals": true, + "noUnusedParameters": true, + "noFallthroughCasesInSwitch": true, + "skipLibCheck": true, + "esModuleInterop": true, + "allowSyntheticDefaultImports": true, + "forceConsistentCasingInFileNames": true, + "declaration": true, + "declarationMap": true, + "sourceMap": true, + "outDir": "./dist", + "rootDir": "./src" + }, + "include": [ + "src/**/*", + "test/**/*" + ], + "exclude": [ + "node_modules", + "dist" + ] +} \ No newline at end of file diff --git a/libs/is-bot/tsup.config.ts b/libs/is-bot/tsup.config.ts new file mode 100644 index 00000000..897927ca --- /dev/null +++ b/libs/is-bot/tsup.config.ts @@ -0,0 +1,16 @@ +import { defineConfig } from 'tsup' + +export default defineConfig({ + entry: [ + 'src/index.ts', + 'src/drivers/h3.ts', + 
'src/behaviors/index.ts' + ], + format: ['esm', 'cjs'], + dts: true, + clean: true, + splitting: false, + sourcemap: true, + minify: false, + external: ['h3', 'unstorage'] +}) \ No newline at end of file diff --git a/libs/is-bot/vitest.config.ts b/libs/is-bot/vitest.config.ts new file mode 100644 index 00000000..496b0cc7 --- /dev/null +++ b/libs/is-bot/vitest.config.ts @@ -0,0 +1,19 @@ +import { defineConfig } from 'vitest/config' + +export default defineConfig({ + test: { + environment: 'node', + globals: true, + coverage: { + provider: 'v8', + reporter: ['text', 'json', 'html'], + exclude: [ + 'node_modules/**', + 'dist/**', + 'test/**', + '**/*.d.ts', + '**/*.config.*' + ] + } + } +}) \ No newline at end of file diff --git a/src/module.ts b/src/module.ts index 3e258a02..c3933406 100644 --- a/src/module.ts +++ b/src/module.ts @@ -158,12 +158,85 @@ export interface ModuleOptions { */ credits: boolean /** - * Enable bot detection plugin. - * When disabled, no bot detection is performed. 
- * + * Bot detection and behavioral analysis configuration + * @default false (can be boolean for simple enable/disable or object for advanced config) + */ + botDetection?: boolean | BotDetectionConfig +} + +export interface BotDetectionConfig { + /** + * Whether bot detection is enabled * @default true */ - botDetection?: boolean + enabled?: boolean + /** + * Session configuration + */ + session?: { + /** + * Session encryption password (auto-generated if not provided) + */ + password?: string + /** + * Session TTL in milliseconds + * @default 86400000 (24 hours) + */ + ttl?: number + /** + * Maximum sessions per IP + * @default 10 + */ + maxSessionsPerIP?: number + } + /** + * Detection thresholds + */ + thresholds?: { + /** + * Score threshold for definite bot classification + * @default 90 + */ + definitelyBot?: number + /** + * Score threshold for likely bot classification + * @default 70 + */ + likelyBot?: number + /** + * Score threshold for suspicious behavior + * @default 40 + */ + suspicious?: number + } + /** + * Custom sensitive paths to monitor + */ + customSensitivePaths?: string[] + /** + * IP allowlist/blocklist configuration + */ + ipFilter?: { + /** + * Trusted IP addresses (always allowed) + * @default ['127.0.0.1', '::1'] + */ + trustedIPs?: string[] + /** + * Permanently blocked IP addresses + * @default [] + */ + blockedIPs?: string[] + } + /** + * Whether to enable detailed debug information + * @default false + */ + debug?: boolean +} + +export interface ModuleRuntimeHooks { + 'robots:bot-detected': () => Promise | void } export interface ResolvedModuleOptions extends ModuleOptions { @@ -257,6 +330,8 @@ export default defineNuxtModule({ if (config.metaTag) addPlugin({ mode: 'server', src: resolve('./runtime/app/plugins/robot-meta.server') }) + addPlugin({ src: resolve('./runtime/app/plugins/botd') }) + if (config.robotsTxt && config.mergeWithRobotsTxtPath !== false) { let usingRobotsTxtPath = '' let robotsTxt: boolean | string = false @@ 
-468,7 +543,7 @@ export default defineNuxtModule({ robotsEnabledValue: config.robotsEnabledValue, robotsDisabledValue: config.robotsDisabledValue, cacheControl: config.cacheControl ?? 'max-age=14400, must-revalidate', - botDetection: config.botDetection ?? true, + botDetection: typeof config.botDetection === 'object' ? true : (config.botDetection ?? true), } nuxt.options.runtimeConfig['nuxt-robots'] = robotsRuntimeConfig as any }) @@ -570,6 +645,7 @@ export {} handler: resolve('./runtime/server/middleware/injectContext'), }) addServerPlugin(resolve('./runtime/server/plugins/initContext')) + addServerPlugin(resolve('./runtime/server/plugins/botDetection')) if (isNuxtContentV2) { addServerHandler({ @@ -578,6 +654,18 @@ export {} }) } + addServerHandler({ + route: '/__robots__/beacon', + handler: resolve('./runtime/server/routes/__robots__/beacon'), + }) + addServerHandler({ + route: '/__robots__/flag', + handler: resolve('./runtime/server/routes/__robots__/flag'), + }) + addServerHandler({ + route: '/__robots__/debug-bot-detection', + handler: resolve('./runtime/server/routes/__robots__/debug-bot-detection'), + }) if (config.debug || nuxt.options.dev) { addServerHandler({ route: '/__robots__/debug.json', diff --git a/src/runtime/server/lib/storage.ts b/src/runtime/server/lib/storage.ts new file mode 100644 index 00000000..055ec47f --- /dev/null +++ b/src/runtime/server/lib/storage.ts @@ -0,0 +1,300 @@ +import type { H3Event } from 'h3' +import type { BotDetectionBehavior, IPData, SessionData } from './behavior' +import { getRequestIP, useSession } from 'h3' +import { useRuntimeConfig, useStorage } from 'nitropack/runtime' + +import { TrafficType } from './behavior' + +// Performance optimization: Batch storage updates +const pendingUpdates = new Map() +let SESSION_TTL = 24 * 60 * 60 * 1000 // 24 hours +let MAX_SESSIONS_PER_IP = 10 +const BATCH_FLUSH_INTERVAL = 30000 // 30 seconds +const MAX_PENDING_UPDATES = 100 + +// IP allowlist/blocklist for enhanced security 
+const TRUSTED_IPS = new Set(['127.0.0.1', '::1', '10.0.0.0/8', '172.16.0.0/12', '192.168.0.0/16']) +const BLOCKED_IPS = new Set() +const TEMP_BLOCKED_IPS = new Map() // IP -> unblock timestamp + +// Configuration defaults +let configInitialized = false +function initializeConfig() { + if (configInitialized) + return + + try { + const config = useRuntimeConfig() + const botConfig = (config as any)?.robots?.botDetection + + if (botConfig && typeof botConfig === 'object') { + // Update session config + if (botConfig.session?.ttl) { + SESSION_TTL = botConfig.session.ttl + } + if (botConfig.session?.maxSessionsPerIP) { + MAX_SESSIONS_PER_IP = botConfig.session.maxSessionsPerIP + } + + // Update IP filter config + if (botConfig.ipFilter?.trustedIPs) { + botConfig.ipFilter.trustedIPs.forEach((ip: string) => TRUSTED_IPS.add(ip)) + } + if (botConfig.ipFilter?.blockedIPs) { + botConfig.ipFilter.blockedIPs.forEach((ip: string) => BLOCKED_IPS.add(ip)) + } + } + + configInitialized = true + } + catch { + // Fallback to defaults if config is not available + configInitialized = true + } +} + +// IP utility functions +function isIPTrusted(ip: string): boolean { + return TRUSTED_IPS.has(ip) || ip.startsWith('127.') || ip.startsWith('::1') +} + +function isIPBlocked(ip: string): boolean { + if (BLOCKED_IPS.has(ip)) + return true + + const tempBlockExpiry = TEMP_BLOCKED_IPS.get(ip) + if (tempBlockExpiry && Date.now() < tempBlockExpiry) { + return true + } + else if (tempBlockExpiry) { + // Temp block expired, remove it + TEMP_BLOCKED_IPS.delete(ip) + } + return false +} + +function blockIPTemporarily(ip: string, durationMs: number = 60 * 60 * 1000) { // 1 hour default + TEMP_BLOCKED_IPS.set(ip, Date.now() + durationMs) +} + +// Session cleanup utilities +function cleanupOldSessions(ipData: IPData): IPData { + const now = Date.now() + const validSessions = ipData.activeSessions.filter((_sessionId) => { + // Keep sessions that are recent enough + return (now - ipData.lastUpdated) < 
SESSION_TTL + }) + + // Limit sessions per IP + if (validSessions.length > MAX_SESSIONS_PER_IP) { + ipData.activeSessions = validSessions.slice(-MAX_SESSIONS_PER_IP) + } + else { + ipData.activeSessions = validSessions + } + + ipData.sessionCount = ipData.activeSessions.length + return ipData +} + +// Batch storage operations for performance +async function flushPendingUpdates() { + if (pendingUpdates.size === 0) + return + + const storage = useStorage('cache:robots:bot-detection') + const updates = Array.from(pendingUpdates.entries()) + pendingUpdates.clear() + + // Batch write all pending updates + await Promise.all(updates.map(async ([key, behavior]) => { + const sessionKey = `session:${behavior.id}` + const ipKey = `ip:${key.split(':')[1]}` // Extract IP from composite key + + return Promise.all([ + storage.setItem(sessionKey, behavior.session), + storage.setItem(ipKey, cleanupOldSessions(behavior.ip)), + ]) + })) +} + +// Auto-flush pending updates +setInterval(flushPendingUpdates, BATCH_FLUSH_INTERVAL) + +// Flush when too many pending updates +function maybeForcedFlush() { + if (pendingUpdates.size >= MAX_PENDING_UPDATES) { + return flushPendingUpdates() + } +} + +export async function initBotDetectionSession(event: H3Event) { + initializeConfig() + + const ip = getRequestIP(event, { xForwardedFor: true }) + + // Get session password from config or generate default + let sessionPassword = '80d42cfb-1cd2-462c-8f17-e3237d9027e9' // fallback + try { + const config = useRuntimeConfig() + const botConfig = (config as any)?.robots?.botDetection + if (botConfig && typeof botConfig === 'object' && botConfig.session?.password) { + sessionPassword = botConfig.session.password + } + } + catch { + // Use fallback password + } + + const session = await useSession(event, { + password: sessionPassword, + }) + // fetch sdession data + const sessionKey = `session:${session.id}` + const ipKey = `ip:${ip}` + // TODO runtimeConfig support + return { + sessionKey, + ipKey, + 
session, + storage: useStorage('cache:robots:bot-detection'), + } +} + +export async function getBotDetectionBehavior(e: H3Event): Promise { + const now = Date.now() + const { ipKey, session, storage, sessionKey } = await initBotDetectionSession(e) + const ip = getRequestIP(e, { xForwardedFor: true }) + + // Check IP allowlist/blocklist first + if (ip && isIPTrusted(ip)) { + return { + id: session.id, + session: { + lastRequests: [], + suspiciousPathHits: 0, + maybeSensitivePathHits: 0, + uniqueSensitivePathsAccessed: [], + errorCount: 0, + lastScore: 0, + score: 0, + lastUpdated: now, + trafficType: TrafficType.REGULAR_USER, + knownGoodActions: 10, // Trusted IPs get good score + requestMethodVariety: [], + requestSequenceEntropy: 0, + firstSeenAt: now, + }, + ip: { + sessionCount: 1, + activeSessions: [session.id], + suspiciousScore: 0, + lastUpdated: now, + legitSessionsCount: 1, + isBot: false, + isBotConfidence: 0, + lastSessionCreated: now, + factores: ['trusted-ip'], + details: { name: 'trusted', type: 'trusted', trusted: true }, + }, + trusted: true, + } as BotDetectionBehavior + } + + if (ip && isIPBlocked(ip)) { + return { + id: session.id, + session: { + lastRequests: [], + suspiciousPathHits: 0, + maybeSensitivePathHits: 0, + uniqueSensitivePathsAccessed: [], + errorCount: 0, + lastScore: 100, + score: 100, + lastUpdated: now, + trafficType: TrafficType.MALICIOUS_BOT, + knownGoodActions: 0, + requestMethodVariety: [], + requestSequenceEntropy: 0, + firstSeenAt: now, + }, + ip: { + sessionCount: 1, + activeSessions: [session.id], + suspiciousScore: 100, + lastUpdated: now, + legitSessionsCount: 0, + isBot: true, + isBotConfidence: 100, + lastSessionCreated: now, + factores: ['blocked-ip'], + details: { name: 'blocked', type: 'blocked', trusted: false }, + }, + blocked: true, + } as BotDetectionBehavior + } + + const sessionData = storage.getItem(sessionKey) + const ipData = storage.getItem(ipKey) + return Promise.all([ + sessionData, + ipData, + 
]).then(([sessionData, ip]) => { + return { + id: session.id, + session: sessionData || { + lastRequests: [], + suspiciousPathHits: 0, + maybeSensitivePathHits: 0, + uniqueSensitivePathsAccessed: [], + errorCount: 0, + lastScore: 0, + score: 0, + lastUpdated: now, + trafficType: TrafficType.UNKNOWN, + knownGoodActions: 0, + requestMethodVariety: [], + requestSequenceEntropy: 0, + firstSeenAt: now, + }, + ip: cleanupOldSessions(ip || { + sessionCount: 0, + activeSessions: [], + suspiciousScore: 0, + lastUpdated: now, + legitSessionsCount: 0, + isBot: false, + isBotConfidence: 0, + lastSessionCreated: now, + factores: [], + }), + } satisfies BotDetectionBehavior + }) +} + +export async function updateBotSessionBehavior(e: H3Event, behavior: BotDetectionBehavior) { + const ip = getRequestIP(e, { xForwardedFor: true }) + const compositeKey = `${behavior.id}:${ip}` + + // Add to pending updates for batch processing + pendingUpdates.set(compositeKey, behavior) + + // If malicious behavior detected, block IP temporarily + if (ip && behavior.ip.isBot && (behavior.ip.isBotConfidence || 0) > 80) { + blockIPTemporarily(ip, 60 * 60 * 1000) // 1 hour block + } + + // Force flush if too many pending updates + await maybeForcedFlush() +} + +// Export utilities for external use +export { + BLOCKED_IPS, + blockIPTemporarily, + flushPendingUpdates, + isIPBlocked, + isIPTrusted, + TRUSTED_IPS, +} diff --git a/src/runtime/server/plugins/botDetection.ts b/src/runtime/server/plugins/botDetection.ts new file mode 100644 index 00000000..c6ba272e --- /dev/null +++ b/src/runtime/server/plugins/botDetection.ts @@ -0,0 +1,117 @@ +import { getHeaders } from 'h3' +import { defineNitroPlugin, useRuntimeConfig, useStorage } from 'nitropack/runtime' +import { isBotFromHeaders } from '../../../util' +import { + BotDetectionEngine, + H3SessionIdentifier, + H3ResponseStatusProvider, + h3ToBotDetectionRequest, + createTrackedBotDetectionRequest +} from '../../../libs/is-bot/src' +import { 
UnstorageBehaviorAdapter } from '../../../libs/is-bot/src/adapters/behavior-storage'
+
+const DAY_MS = 24 * 60 * 60 * 1000
+
+// SECURITY: this fallback is committed to the repository, so any deployment that does not set
+// `robots.botDetection.session.password` shares a publicly known session secret.
+// NOTE(review): consider requiring the option (or deriving a per-deployment secret) instead.
+const FALLBACK_SESSION_PASSWORD = '80d42cfb-1cd2-462c-8f17-e3237d9027e9'
+
+/** Services shared by every request handled by this plugin. */
+interface BotDetectionRuntime {
+  engine: BotDetectionEngine
+  responseStatusProvider: H3ResponseStatusProvider
+}
+
+// Created on first use and reused afterwards. Engine and status provider are cached together so
+// callers never see one without the other (no non-null assertion needed).
+let botDetectionRuntime: BotDetectionRuntime | null = null
+
+/**
+ * Builds the bot detection engine and its response status provider from
+ * `runtimeConfig.robots.botDetection` on first call and returns the cached pair afterwards.
+ *
+ * If construction throws, nothing is cached and the next call tries again.
+ *
+ * @returns The shared engine and the response status provider it was created with.
+ */
+function initializeBotDetection(): BotDetectionRuntime {
+  if (botDetectionRuntime)
+    return botDetectionRuntime
+
+  const config = useRuntimeConfig()
+  const botConfig = (config as any)?.robots?.botDetection || {}
+
+  // Single source for the session TTL: it is passed to both the storage adapter and the engine
+  const sessionTTL = botConfig.session?.ttl || DAY_MS
+
+  // Create storage adapter using Nitro's useStorage
+  const storage = useStorage('cache:robots:bot-detection')
+  const storageAdapter = new UnstorageBehaviorAdapter(storage, {
+    sessionTTL,
+    ipTTL: botConfig.ip?.ttl || 7 * DAY_MS,
+    siteProfileTTL: botConfig.siteProfile?.ttl || 30 * DAY_MS,
+  })
+
+  const sessionIdentifier = new H3SessionIdentifier(
+    botConfig.session?.password || FALLBACK_SESSION_PASSWORD,
+  )
+  const responseStatusProvider = new H3ResponseStatusProvider()
+
+  const engine = new BotDetectionEngine({
+    storage: storageAdapter,
+    sessionIdentifier,
+    responseStatusProvider,
+    config: {
+      session: {
+        ttl: sessionTTL,
+        maxSessionsPerIP: botConfig.session?.maxSessionsPerIP || 10,
+      },
+      thresholds: {
+        definitelyBot: botConfig.thresholds?.definitelyBot || 90,
+        likelyBot: botConfig.thresholds?.likelyBot || 70,
+        suspicious: botConfig.thresholds?.suspicious || 40,
+      },
+      ipFilter: {
+        // Loopback addresses are always appended to the configured trusted list
+        trustedIPs: [...(botConfig.ipFilter?.trustedIPs || []), '127.0.0.1', '::1'],
+        blockedIPs: botConfig.ipFilter?.blockedIPs || [],
+      },
+      debug: botConfig.debug || false,
+      customSensitivePaths: botConfig.customSensitivePaths || [],
+      behaviors: botConfig.behaviors,
+    },
+  })
+
+  botDetectionRuntime = { engine, responseStatusProvider }
+  return botDetectionRuntime
+}
+
+/**
+ * Nitro plugin that stores a bot detection result on `event.context.botDetection` for every request.
+ *
+ * Order of checks: user-agent match first (short-circuits with a `block` recommendation), then the
+ * behavior engine. Any failure falls back to the user-agent result alone and records the error message.
+ */
+export default defineNitroPlugin((nitroApp) => {
+  nitroApp.hooks.hook('request', async (event) => {
+    try {
+      const { engine, responseStatusProvider } = initializeBotDetection()
+
+      // Quick user-agent check first (legacy compatibility)
+      const { isBot, data } = isBotFromHeaders(getHeaders(event))
+      if (isBot) {
+        event.context.botDetection = {
+          isBot: true,
+          confidence: 100,
+          score: 100,
+          factors: [{ type: 'USER_AGENT', score: 100, reason: 'Bot detected via user agent' }],
+          recommendation: 'block',
+          sessionId: 'ua-bot',
+          legacy: true,
+          details: data
+            ? {
+                name: data.botName,
+                type: data.botCategory,
+                trusted: data.trusted,
+              }
+            : null,
+        }
+        return
+      }
+
+      // Use framework-agnostic bot detection engine
+      const request = createTrackedBotDetectionRequest(event, responseStatusProvider)
+      event.context.botDetection = await engine.analyze(request, event)
+    }
+    catch (error) {
+      // Fallback to legacy detection on error. The header check is guarded because it may be the
+      // very call that failed above; a second throw here would escape the request hook.
+      let isBot = false
+      try {
+        isBot = isBotFromHeaders(getHeaders(event)).isBot
+      }
+      catch {
+        // Deliberate best effort: without a usable header check the request is treated as non-bot
+      }
+
+      event.context.botDetection = {
+        isBot,
+        confidence: isBot ? 100 : 0,
+        score: isBot ? 100 : 0,
+        factors: isBot ? [{ type: 'USER_AGENT', score: 100, reason: 'Bot detected via user agent (fallback)' }] : [],
+        recommendation: isBot ? 'block' : 'allow',
+        sessionId: 'fallback',
+        error: error instanceof Error ? error.message : 'Unknown error',
+      }
+    }
+  })
+
+  // No `afterResponse` hook is registered: the engine handles storage updates internally during
+  // analysis, so an empty async handler would only add per-response overhead.
+})
diff --git a/src/runtime/server/routes/__robots__/beacon.ts b/src/runtime/server/routes/__robots__/beacon.ts
new file mode 100644
index 00000000..3efce8a2
--- /dev/null
+++ b/src/runtime/server/routes/__robots__/beacon.ts
@@ -0,0 +1,9 @@
+import { defineEventHandler } from 'h3'
+
+/**
+ * This route is called using a beacon request from the client when the page was loaded as prerendered HTML. Since it
+ * didn't pass through the server we need to initialize the session via this request.
+ *
+ * This is handled by the botDetection.ts plugin.
+ */
+export default defineEventHandler(() => 'OK')
diff --git a/src/runtime/server/routes/__robots__/debug-bot-detection.ts b/src/runtime/server/routes/__robots__/debug-bot-detection.ts
new file mode 100644
index 00000000..51c38ebc
--- /dev/null
+++ b/src/runtime/server/routes/__robots__/debug-bot-detection.ts
@@ -0,0 +1,67 @@
+import { defineEventHandler, getQuery } from 'h3'
+// NOTE(review): from routes/__robots__/ the path `../../../libs` resolves to src/runtime/libs, while
+// plugins/botDetection.ts (one directory shallower) uses the same depth to reach src/libs - confirm
+// which location is correct.
+import {
+  analyzeSessionAndIpBehavior,
+  BEHAVIOR_WEIGHTS,
+  BOT_SCORE_THRESHOLDS,
+  SENSITIVE_PATHS,
+} from '../../../libs/is-bot/src/behavior'
+import { getBotDetectionBehavior } from '../../../libs/is-bot/src/storage'
+
+/**
+ * Debug endpoint for bot detection analysis.
+ *
+ * Returns the current session's and IP's detection state. With `?detailed=true` the raw behavior
+ * object and the scoring constants are included as well. Failures are reported in the response
+ * body (`error`, `message`, `timestamp`) rather than thrown.
+ */
+export default defineEventHandler(async (event) => {
+  try {
+    const detailed = getQuery(event).detailed === 'true'
+
+    // Get current behavior analysis
+    const behavior = await getBotDetectionBehavior(event)
+
+    // Run analysis with debug enabled to get detailed info.
+    // The return value is unused; presumably the call updates `behavior` in place - TODO confirm.
+    analyzeSessionAndIpBehavior({
+      event,
+      behavior,
+      debug: true,
+    })
+
+    // Read the clock once so `timestamp` and `sessionAge` describe the same instant
+    const now = Date.now()
+    const response = {
+      timestamp: now,
+      session: {
+        id: behavior.id,
+        isBot: behavior.ip.isBot,
+        confidence: behavior.ip.isBotConfidence,
+        score: behavior.session.score,
+        trafficType: behavior.session.trafficType,
+        requestCount: behavior.session.lastRequests.length,
+        sessionAge: now - behavior.session.firstSeenAt,
+      },
+      ip: {
+        sessionCount: behavior.ip.sessionCount,
+        suspiciousScore: behavior.ip.suspiciousScore,
+        legitSessionsCount: behavior.ip.legitSessionsCount,
+        factors: behavior.ip.factores,
+        details: behavior.ip.details,
+      },
+      debug: behavior.debug,
+    }
+
+    if (!detailed)
+      return response
+
+    // Include detailed information if requested. The constants come from the static import above;
+    // the module is already loaded, so re-importing it dynamically per field was redundant.
+    return {
+      ...response,
+      rawBehavior: behavior,
+      constants: {
+        BOT_SCORE_THRESHOLDS,
+        BEHAVIOR_WEIGHTS,
+        SENSITIVE_PATHS,
+      },
+    }
+  }
+  catch (error) {
+    return {
+      error: 'Bot detection debug failed',
+      message: error instanceof Error ? error.message : 'Unknown error',
+      timestamp: Date.now(),
+    }
+  }
+})
diff --git a/src/runtime/server/routes/__robots__/flag.ts b/src/runtime/server/routes/__robots__/flag.ts
new file mode 100644
index 00000000..007e9856
--- /dev/null
+++ b/src/runtime/server/routes/__robots__/flag.ts
@@ -0,0 +1,13 @@
+import { defineEventHandler } from 'h3'
+import { getBotDetectionBehavior, updateBotSessionBehavior } from '../../../libs/is-bot/src/storage'
+
+/**
+ * This route is called using a beacon request from the client, telling the server that the client has failed the
+ * bot detection test.
+ *
+ * Marks the request's IP as a bot, records `botd.js` as the contributing factor and queues the update.
+ */
+export default defineEventHandler(async (e) => {
+  const behavior = await getBotDetectionBehavior(e)
+  behavior.ip.isBot = true
+  // Record the factor once: repeated beacons must not grow the stored list without bound
+  if (!behavior.ip.factores.includes('botd.js'))
+    behavior.ip.factores.push('botd.js')
+  await updateBotSessionBehavior(e, behavior)
+})
diff --git a/src/runtime/types.ts b/src/runtime/types.ts
index 8d6ab1bf..ac0c6290 100644
--- a/src/runtime/types.ts
+++ b/src/runtime/types.ts
@@ -104,6 +104,23 @@ export interface AutoI18nConfig {
   strategy: 'prefix' | 'prefix_except_default' | 'prefix_and_default' | 'no_prefix'
 }
 
+/** Running score aggregate: sum of scores, number of samples and their mean. */
+export interface BotScoreData {
+  total: number
+  count: number
+  average: number
+}
+
+/** Per-day bot statistics, keyed by date string. */
+export interface DailyBotStats {
+  [date: string]: {
+    count: number
+    // NOTE(review): the type arguments below were missing (bare `Record` does not compile) and
+    // were reconstructed from the inline comments - confirm against the intended shapes.
+    bots: Record<string, number> // Bot type -> count
+    scores?: Record<string, BotScoreData> // Bot type -> score aggregate
+    hourly?: number[] // 24 elements array for hourly distribution
+    paths?: Record<string, Record<string, number>> // Bot type -> path -> count
+    sources?: Record<string, number> // Detection source -> count
+  }
+}
+
 export interface RobotsContext {
   rule: string
   indexable: boolean
diff --git a/src/util.ts b/src/util.ts
index 9a5cc7bf..3aca45ad 100644
--- a/src/util.ts
+++ b/src/util.ts
@@ -432,4 +432,3 @@ export function getBotInfo(
     method: detection.detectionMethod,
   }
 }
-
diff --git a/test/unit/botBehavior.test.ts
b/test/unit/botBehavior.test.ts
new file mode 100644
index 00000000..06311f4a
--- /dev/null
+++ b/test/unit/botBehavior.test.ts
@@ -0,0 +1,42 @@
+import { describe, expect, it } from 'vitest'
+
+// Local copies of the detection constants, declared here to avoid runtime dependencies.
+// Nothing is imported from the library, so the assertions below only check these literals.
+// NOTE(review): these tests cannot detect drift from the real constants - confirm that is acceptable.
+const SENSITIVE_PATHS = [
+  '/wp-login',
+  '/xmlrpc.php',
+  '/.env',
+  '/phpmyadmin',
+]
+
+const BEHAVIOR_WEIGHTS = {
+  SENSITIVE_PATH: 15,
+  RAPID_REQUESTS: 20,
+  MULTIPLE_SENSITIVE_HITS: 40,
+}
+
+const BOT_SCORE_THRESHOLDS = {
+  DEFINITELY_BOT: 90,
+  LIKELY_BOT: 70,
+  SUSPICIOUS: 40,
+}
+
+describe('bot behavior constants', () => {
+  it('defines sensitive paths for bot detection', () => {
+    expect(SENSITIVE_PATHS).toContain('/wp-login')
+    expect(SENSITIVE_PATHS).toContain('/.env')
+    expect(SENSITIVE_PATHS).toContain('/phpmyadmin')
+    expect(SENSITIVE_PATHS.length).toBeGreaterThan(3)
+  })
+
+  it('defines behavior weights for scoring', () => {
+    expect(BEHAVIOR_WEIGHTS.SENSITIVE_PATH).toBe(15)
+    expect(BEHAVIOR_WEIGHTS.RAPID_REQUESTS).toBe(20)
+    expect(BEHAVIOR_WEIGHTS.MULTIPLE_SENSITIVE_HITS).toBe(40)
+  })
+
+  it('defines bot score thresholds', () => {
+    expect(BOT_SCORE_THRESHOLDS.DEFINITELY_BOT).toBe(90)
+    expect(BOT_SCORE_THRESHOLDS.LIKELY_BOT).toBe(70)
+    expect(BOT_SCORE_THRESHOLDS.SUSPICIOUS).toBe(40)
+  })
+})