diff --git a/.env.example b/.env.example index e0a39d7..ab8e3ff 100644 --- a/.env.example +++ b/.env.example @@ -51,7 +51,6 @@ QDRANT_ADDR=qdrant:6334 CLICKHOUSE_ADDR=clickhouse:9000 EMBEDDER_ADDR=embedder:50051 INTELLIGENCE_URL=http://intelligence:5000 -JAEGER_ADDR=jaeger:4317 # --- Optional --- diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ba504fc..e1a587e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,9 +2,9 @@ name: Hyperion CI on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] jobs: gateway: @@ -12,12 +12,19 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + - name: Set up Go uses: actions/setup-go@v5 with: go-version: '1.22' + + - name: Generate test secrets + run: | + echo "CACHE_MASTER_SECRET=$(openssl rand -hex 32)" >> "$GITHUB_ENV" + - name: Build Gateway run: cd gateway && go build -v ./cmd/server/main.go + - name: Run Gateway Tests run: cd gateway && go test -v ./... 
@@ -26,12 +33,15 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + - name: Set up Node uses: actions/setup-node@v4 with: node-version: '18' + - name: Install Dependencies run: cd dashboard && npm install + - name: Build Dashboard run: cd dashboard && npm run build @@ -40,14 +50,17 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + - name: Set up Python uses: actions/setup-python@v5 with: python-version: '3.10' + - name: Install Intelligence Deps run: | cd predictor pip install -r requirements.txt + - name: Install Embedder Deps run: | cd embedder diff --git a/.github/workflows_disabled/ci.yml b/.github/workflows_disabled/ci.yml new file mode 100644 index 0000000..ba504fc --- /dev/null +++ b/.github/workflows_disabled/ci.yml @@ -0,0 +1,54 @@ +name: Hyperion CI + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + gateway: + name: Gateway CI + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.22' + - name: Build Gateway + run: cd gateway && go build -v ./cmd/server/main.go + - name: Run Gateway Tests + run: cd gateway && go test -v ./... 
+ + dashboard: + name: Dashboard CI + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Node + uses: actions/setup-node@v4 + with: + node-version: '18' + - name: Install Dependencies + run: cd dashboard && npm install + - name: Build Dashboard + run: cd dashboard && npm run build + + python-microservices: + name: Python CI + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Install Intelligence Deps + run: | + cd predictor + pip install -r requirements.txt + - name: Install Embedder Deps + run: | + cd embedder + pip install -r requirements.txt diff --git a/.gitignore b/.gitignore index 6368a6c..41f0661 100644 --- a/.gitignore +++ b/.gitignore @@ -9,10 +9,18 @@ Thumbs.db *.swo # Go gateway/bin/ -*.exe -*.test -*.out -go.work +gateway/gateway-server +gateway/benchmark +gateway/cbenchmark +gateway.pid +*.go_snippet +gateway/*_snippet +*.exe +*.test +*.out +go.work +repro_*.py +repro_*.txt # Python __pycache__/ @@ -24,6 +32,21 @@ env/ venv/ .venv/ pip-log.txt +*.egg-info/ +*.egg +dist/ +build/ +.pytest_cache/ +*.pyi.bak + +# TypeScript / Node SDK +sdk/typescript/dist/ +sdk/typescript/node_modules/ +sdk/typescript/*.tsbuildinfo +sdk/python/dist/ +sdk/python/build/ +sdk/python/.pytest_cache/ +sdk/python/hyperion_ai.egg-info/ # Project Data (Do not commit DBs) diff --git a/DEPLOY.md b/DEPLOY.md index 7edb71d..aaa8216 100644 --- a/DEPLOY.md +++ b/DEPLOY.md @@ -25,8 +25,8 @@ A complete Hyperion deployment consists of several specialized services working | Recommendation | CPU | RAM | Storage | |---|---|---|---| | **Minimum** | 2 Cores | 4 GB | 20 GB SSD | -| **Recommended** | 4 Cores | 8 GB | 100 GB NVMe | -| **High Traffic** | 8+ Cores | 16+ GB | 500+ GB NVMe | +| **Recommended** | 4 Cores | 8 GB | 60 GB NVMe | +| **High Traffic** | 8+ Cores | 16+ GB | 100+ GB NVMe | ## Standard Deployment (Docker Compose) @@ -75,7 +75,6 @@ Once 
deployed, verify the health of the individual components: ### Performance Tuning * **Redis Persistence:** For mission-critical deployments, enable AOF (Append Only File) in Redis to prevent cache data loss during restarts. * **ClickHouse Retention:** Configure ClickHouse TTL policies to manage storage costs for request logs (e.g., set to 30 days). -* **Embedding Models:** The `embedder` service is computationally intensive. If latency spikes during semantic lookups, consider moving the `embedder` to a GPU-enabled instance. ### Maintenance * **Updates:** Pull the latest images and restart with `docker-compose pull && docker-compose up -d`. Hyperion maintains backwards compatibility across minor versions. diff --git a/PLAN.md b/PLAN.md deleted file mode 100644 index 56620b2..0000000 --- a/PLAN.md +++ /dev/null @@ -1,106 +0,0 @@ -# Layered Budgets (Daily/Weekly/Monthly) — Org + User (Phase 1) - -## Summary -Implement production-grade layered budget enforcement for **orgs and users** with three windows: **daily, weekly, monthly**. -Per your requirement: -- `daily_limit` and `weekly_limit` are **nullable** and treated as **infinite** when unset. -- If a nullable limit is not configured, that window check is skipped. -- `monthly_limit` remains the primary enforced budget layer. - -Scope locked for this phase: **Org + User only** (no key-level layered budgets yet). - -## Budget Semantics -1. Enforcement order: `daily -> weekly -> monthly`. -2. Hard-stop behavior: first breached configured layer blocks request. -3. Null semantics: - - `daily_limit_minor = NULL` => skip daily check. - - `weekly_limit_minor = NULL` => skip weekly check. - - `monthly_limit_minor = NULL` => unlimited monthly (allowed but discouraged for paid orgs). -4. Period boundaries: - - Daily: UTC day. - - Weekly: Monday 00:00 UTC. - - Monthly: calendar month UTC. - -## Backend Changes - -### 1) Data model and migrations -1. 
Extend org budget storage to include: - - `daily_limit_minor_units BIGINT NULL` - - `weekly_limit_minor_units BIGINT NULL` - - `monthly_limit_minor_units BIGINT NULL` (existing may be reused) -2. Extend member budgets table similarly: - - `daily_limit_minor_units BIGINT NULL` - - `weekly_limit_minor_units BIGINT NULL` - - `monthly_limit_minor_units BIGINT NULL` -3. Add periodized usage keys/counters for each scope/window: - - Org: day/week/month spend counters - - User: day/week/month spend counters -4. Add indexes/keys for efficient period resolution and atomic increments. - -### 2) Repositories and domain types -1. Introduce layered budget structs: - - `BudgetLimits{Daily *int64, Weekly *int64, Monthly *int64}` - - `BudgetUsage{Daily int64, Weekly int64, Monthly int64}` -2. Update budget repos to read/write nullable limits cleanly. -3. Preserve old monthly-only methods as wrappers during transition, then remove once callers migrate. - -### 3) Enforcement path -1. Update org budget checks (`billing/manager` + auth path) to: - - Load limits once. - - Resolve current period usage for day/week/month. - - Evaluate configured windows only. - - Return machine-readable breach reason: `daily_budget_exceeded|weekly_budget_exceeded|monthly_budget_exceeded`. -2. Update member budget checks in `tenant_auth.go` similarly. -3. Keep rate limiting logic separate; do not couple with budget checks. - -### 4) API surface -1. Update org budget APIs to accept/return layered limits: - - `daily_limit`, `weekly_limit`, `monthly_limit` (nullable) -2. Update member budget APIs similarly: - - `PUT/GET /v1/admin/members/{id}/budget` -3. Response payload should include both limits and current usage per window. - -### 5) Error contracts and observability -1. Standardize budget denial response: - - `code`, `message`, `breached_window`, `limit_minor`, `usage_minor`, `scope`. -2. Add response header on budget deny: - - `X-Budget-Window: daily|weekly|monthly` -3. 
Emit structured log event for denies with scope/window/tenant/user/request id. - -## Compatibility and Defaults -1. Existing tenants with only monthly configured: - - Monthly remains active. - - Daily/weekly default to `NULL` (skipped). -2. No behavior break for orgs/users with unlimited budgets (null monthly). -3. No key-layer changes in this phase. - -## Tests - -### Unit -1. Null daily/weekly => checks skipped. -2. Daily set + exceeded => denied before weekly/monthly. -3. Weekly set + exceeded (daily unset/not exceeded) => denied. -4. Monthly set + exceeded => denied. -5. UTC boundary rollover correctness for day/week/month windows. - -### Integration -1. Org layered budget enforcement with real reserve/usage increments. -2. Member layered budget enforcement under concurrent requests. -3. API read/write roundtrip with nullable fields. -4. Legacy monthly-only orgs still enforce monthly correctly. - -### Regression -1. Rate-limit paths unchanged. -2. Existing billing/reserve flow remains atomic and consistent. - -## Rollout -1. Ship migration + dual-read compatible repo changes. -2. Enable layered enforcement code path. -3. Backfill existing rows (`daily=NULL, weekly=NULL`). -4. Monitor deny rates by `breached_window`. - -## Assumptions -1. Currency storage remains minor units. -2. UTC is accepted for all period boundaries. -3. Hard-stop behavior is desired for all configured windows. -4. Key-level layered budgets are deferred to phase 2. 
diff --git a/README.md b/README.md index 17ab26a..ae524a7 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,32 @@ # Hyperion [![License: AGPL v3](https://img.shields.io/badge/license-AGPL--3.0-blue.svg)](LICENSE) -[![Status](https://img.shields.io/badge/status-beta-orange.svg)](#) -[![RPS](https://img.shields.io/badge/throughput-21k%2B%20RPS-brightgreen)](#performance) -[![Overhead](https://img.shields.io/badge/overhead-5%C2%B5s%20median-brightgreen)](#performance) +[![Docs](https://img.shields.io/badge/docs-available-blue.svg)](docs/) +[![Blog](https://img.shields.io/badge/blog-read-blue.svg)](https://hyperionhq.co/blog) +[![Website](https://img.shields.io/badge/website-hyperionhq.co-blue.svg)](https://hyperionhq.co) -**LLM caching and orchestration for teams who can't afford to be slow.** +[Quick Start](docs/quick-start.md) • [Documentation](docs/) • [Examples](docs/examples) • [Twitter](https://twitter.com/hyperionhq) • [Website](https://hyperionhq.co) • [Blog](https://hyperionhq.co/blog) -Hyperion sits between your application and your LLM providers. Identical requests return in microseconds. Semantically equivalent ones are caught by vector search. Everything else gets routed to the cheapest model capable of answering it—automatically. +**⚡ 5µs Median Overhead • 2-Layer Caching • Smart Routing • Budget Controls • Spend Forecasting • Key Scoping • Real-time Analytics** -**20,000+ RPS. 5µs median overhead. Zero dropped requests.** +**LLM caching and orchestration for teams who can't afford a single millisecond of overhead.** -At that scale, Hyperion gets out of the way. It supports OpenAI, Anthropic, Google Gemini, Mistral, DeepSeek, Cohere and more—unified behind a single OpenAI-compatible endpoint with no client-side changes required. +Hyperion sits directly between your application and your LLM providers. It intercepts identical requests and returns them in **microseconds**. Semantically equivalent queries bypass the LLM entirely via vector search. 
Everything else gets routed to the cheapest capable model dynamically. + +### 🚀 21,000 RPS. 5µs median gateway overhead. Zero dropped requests. + +At that scale, Hyperion is effectively invisible to your stack. It unifies OpenAI, Anthropic, Google Gemini, Mistral, DeepSeek, and Cohere behind a single high-throughput OpenAI-compatible endpoint. No client-side rewrites required. What you get beyond just proxying: - **Semantic + exact-match caching** — two-tier cache with Redis and Qdrant vector search -- **Smart model routing** — ML classifier that routes by complexity and budget burn rate. -- **Budget enforcement** — hard spend limits per API key, user, or organization with auto-cutoff -- **Key scoping** — issue scoped keys with per-key rate limits, quotas, allowed models and TTLs -- **Spend forecasting** — spend forecasting to predict and flag budget overruns before they happen -- **Anomaly detection** — automatic spike detection and key suspension on unusual usage +- **Smart model routing** — ML classifier routes queries by complexity and budget burn rate +- **Budget enforcement** — hard spend limits per API key with auto-cutoff +- **Spend forecasting** — predictive forecasting to flag budget overruns before they happen +- **Key scoping** — issue scoped keys with per-key rate limits, quotas, and allowed models - **Provider failover** — transparent rerouting across providers on errors or rate limits - **Real-time analytics** — cost, latency, cache hit rates, and provider health in the dashboard -- **Distributed tracing** — full request traces via OpenTelemetry and Jaeger -- **Admin API** — full programmatic control over keys, budgets, tenants, and cache +- **Admin API** — full programmatic control over keys, budgets, and cache --- @@ -34,81 +36,73 @@ What you get beyond just proxying: git clone https://github.com/hyperionhq/hyperion-gateway.git cd hyperion-gateway cp .env.example .env -docker-compose up -d --build +docker compose up -d --build ``` -Make a request 
through Hyperion: +Make a request through Hyperion using the official Python SDK: -```bash -curl http://localhost:8080/v1/chat/completions \ - -H "Authorization: Bearer your-key" \ - -H "Content-Type: application/json" \ - -d '{"model": "openai/gpt-5.2", "messages": [{"role": "user", "content": "Hello"}]}' +```python +from hyperion import HyperionClient + +client = HyperionClient( + api_key="your-key", + base_url="http://localhost:8080/v1" +) + +response = client.chat.completions.create( + model="openai/gpt-5.2", + messages=[{"role": "user", "content": "Hello"}] +) + +print(response.choices[0].message.content) ``` -Your existing OpenAI client code works without modification. Change the base URL, keep everything else. +The Hyperion SDK is a lightweight wrapper over the standard OpenAI client. Your existing code works without modification. | Interface | URL | |---|---| | API Endpoint | `http://localhost:8080/v1` | -| Admin Dashboard | `http://localhost:3000` | | Swagger UI | `http://localhost:8080/swagger/index.html` | -| Admin Health | `http://localhost:8080/v1/admin/health` | +| Health Check | `http://localhost:8080/v1/health` | --- ## Performance -Benchmarked under 10,000 requests at 10 concurrent workers against the full live stack: +Benchmarked under 10,000 requests at 10 concurrent workers against the full live stack. +Methodology and reproducible steps: [Benchmarking Guide](docs/benchmarking/getting-started.md). 
-| Metric | Result | +| Metric ([methodology](docs/benchmarking/getting-started.md)) | Result | |--------|--------| | Throughput | **21,646 RPS** | | Successful requests | **10,000 / 10,000** (100%) | | Failed requests | **0** | | RTT — Average | **0.456 ms** | | RTT — Median | **0.363 ms** | -| RTT — p95 | **1.079 ms** | -| RTT — p99 | **1.894 ms** | | Gateway overhead — Median | **5 µs** | -| Gateway overhead — Average | **12.2 µs** | -| Gateway overhead — p95 | **26 µs** | -| Gateway overhead — p99 | **134 µs** | -All overhead is measured as pure dispatch time: JSON parse + internal routing + JSON marshal. The gateway adds no perceptible latency at scale. +All overhead is measured as pure gateway dispatch time (request receive → routing decision → upstream dispatch). Full methodology and reproducible tests: [Run Your Own Benchmarks](docs/benchmarking/run-your-own-benchmarks.md). --- -## What it does - -**Multi-layer response caching** - -- **L1 (Redis):** Sub-millisecond exact-match. Byte-identical requests return before the connection reaches upstream. -- **L2 (Semantic):** Vector similarity on prompt embeddings via Qdrant. Catches conceptually equivalent queries that differ in phrasing. - -**Intelligent routing** - -An ML classifier runs per-request to assign complexity scores. Simple queries route to cheap models. Complex reasoning gets heavier models. No configuration—it adapts to your traffic. - -**Budget enforcement** - -Hard ceilings at the key, user, or org level. When a limit is hit, traffic stops. No "soft warnings" that turn into unexpected invoices. +## Capabilities -**Security** +### ⚡ Multi-Layer Response Caching +- **L1 (Redis) - ~0.5ms:** Sub-millisecond exact-match. Byte-identical requests never touch the external internet. They are served directly from RAM before the upstream connection is even initiated. +- **L2 (Semantic) - ~15ms:** Vector similarity on prompt embeddings via Qdrant. 
Catches conceptually equivalent queries that differ in phrasing ("What is the speed of light" vs "How fast does light travel"). -Upstream credentials are encrypted at rest (AES-256-GCM). Keys are never logged or stored in plaintext. Built-in PII filters scrub sensitive data before it reaches the cache or any upstream. +### Intelligent Routing +An ML classifier runs per-request to assign complexity scores. Simple queries route to cheap models. Complex reasoning gets heavier models. No configuration—it adapts to your traffic dynamically. -**Observability** +### Budget Enforcement & Security +Hard ceilings at the key level. When a limit is hit, traffic stops. Upstream credentials are encrypted at rest (AES-256-GCM). Keys are never logged or stored in plaintext. PII filters scrub sensitive data before it reaches the cache or any upstream. -ClickHouse captures every request at throughput. The dashboard surfaces cost, latency, cache hit rates, and provider health in real time. Full distributed tracing via Jaeger across every service component. - -**Multi-tenancy** - -Organizations, users, API keys, rate limits, and per-key budgets managed through the admin API. One deployment, fully isolated tenants. +### Observability +ClickHouse captures every request at throughput. The dashboard surfaces cost, latency, cache hit rates, and provider health in real time. --- -## Architecture +## Architecture Context ```text Client @@ -121,19 +115,12 @@ Client └─ Upstream: OpenAI / Anthropic / Gemini / Mistral / DeepSeek / ... 
``` ---- - -## Repository - -```text -hyperion-gateway/ -├── gateway/ # Core proxy in Go -├── embedder/ # Embedding service (Python / gRPC) -├── intelligence/ # ML routing and anomaly detection (Python) -├── predictor/ # Cache warming predictor (Python) -├── dashboard/ # Admin UI (React) -└── landing/ # Marketing site -``` +- **gateway/**: Core proxy in Go +- **embedder/**: Embedding service (Python / gRPC) +- **intelligence/**: ML routing and anomaly detection (Python) +- **predictor/**: Cache warming predictor (Python) +- **dashboard/**: Admin UI (React) +- **landing/**: Marketing site --- @@ -141,10 +128,24 @@ hyperion-gateway/ - [Deployment Guide](DEPLOY.md) — production setup, hardware requirements, SSL. - [Contributing](CONTRIBUTING.md) — development setup, coding standards, PRs. +- [Python SDK Examples](docs/examples/python) — runnable Python usage examples. +- [TypeScript SDK Examples](docs/examples/typescript) — runnable TypeScript usage examples. +- [Benchmark Methodology](docs/benchmarking/getting-started.md) — performance testing approach and profiles. - [API Reference](gateway/docs/swagger.json) — full OpenAPI specification. --- +## Community & Support + +Have questions, found a bug, or want to contribute? + +- **[GitHub Issues](https://github.com/hyperionhq/hyperion-gateway/issues)** — Report bugs and request features. +- **[GitHub Discussions](https://github.com/hyperionhq/hyperion-gateway/discussions)** — Ask questions and share ideas. +- **[Contributing Guide](CONTRIBUTING.md)** — Learn how to contribute to Hyperion. +- **[Blog](https://hyperionhq.co/blog)** — Read in-depth posts on caching, routing, and optimization. + +--- + ## License AGPL-3.0. See [LICENSE](LICENSE). For commercial licensing, contact sales@hyperionhq.co. 
diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..68aef87 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,13 @@ +# Security Policy + +## Supported Versions + +Only the latest `main` branch or latest tagged release is actively supported for security updates. + +## Reporting a Vulnerability + +We take the security of Hyperion very seriously. If you discover a security vulnerability, please **do not** open a public issue. + +Instead, please send an email to **security@hyperionhq.co** with a thorough description of the issue, including steps to reproduce it. + +We will acknowledge receipt of your vulnerability report within 48 hours and strive to send you regular updates about our progress. If the vulnerability is accepted, we will coordinate the public disclosure with you to ensure a safe fix is rolled out. diff --git a/cache_design_doc.md b/cache_design_doc.md deleted file mode 100644 index 6196398..0000000 --- a/cache_design_doc.md +++ /dev/null @@ -1,798 +0,0 @@ -# Cost-Aware Inference Cache & Result Deduplication Layer -## High-Level Design & Architecture Document - ---- - -## 1. System Overview - -### Purpose -Build a multi-tier caching system that deduplicates LLM inference requests, reduces token consumption by 40%, and provides secure multi-tenant isolation while handling 100k+ QPS across millions of cached items. - -### Key Capabilities -- **Exact Match Caching**: Instant retrieval for identical prompts -- **Semantic Deduplication**: Match similar prompts using embedding-based search -- **Multi-Tenant Privacy**: Secure isolation with encryption and access controls -- **Content-Addressed Storage**: Deduplicate responses across tenants -- **Intelligent Eviction**: LRU+LFU hybrid policy for optimal cache utilization -- **Cost Tracking**: Real-time token savings and attribution per tenant - ---- - -## 2. 
High-Level Architecture - -### System Layers - -``` -┌─────────────────────────────────────────────────────────────┐ -│ API Gateway Layer │ -│ │ -│ • Rate limiting and request throttling │ -│ • Authentication and tenant identification │ -│ • Request normalization and hash generation │ -│ • Privacy validation and PII detection │ -└─────────────────────────────────────────────────────────────┘ - ↓ -┌─────────────────────────────────────────────────────────────┐ -│ Cache Lookup & Deduplication Layer │ -│ │ -│ ┌────────────────────────────────────────────────────────┐ │ -│ │ L1: Exact Match Cache (Redis) │ │ -│ │ • SHA-256 hash lookup │ │ -│ │ • Sub-millisecond response │ │ -│ │ • 1-6 hour TTL, hot data only │ │ -│ └────────────────────────────────────────────────────────┘ │ -│ ↓ MISS │ -│ ┌────────────────────────────────────────────────────────┐ │ -│ │ L2: Semantic Search Cache (Tile38) │ │ -│ │ • Embedding-based similarity search │ │ -│ │ • Configurable similarity threshold (0.90-0.98) │ │ -│ │ • 6-24 hour TTL, warm data │ │ -│ └────────────────────────────────────────────────────────┘ │ -│ ↓ MISS │ -│ ┌────────────────────────────────────────────────────────┐ │ -│ │ L3: Content-Addressed Store (S3) │ │ -│ │ • Long-term storage and deduplication │ │ -│ │ • 7-30 day retention with lifecycle policies │ │ -│ │ • Promotion to L1/L2 on access │ │ -│ └────────────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────┘ - ↓ ALL MISS ↓ ANY HIT -┌──────────────────┐ ┌──────────────────┐ -│ LLM Provider │ │ Return Cached │ -│ (GPT/Claude) │ │ Response │ -└──────────────────┘ └──────────────────┘ - ↓ -┌─────────────────────────────────────────────────────────────┐ -│ Response Processing & Storage │ -│ │ -│ • Compress with Zstd (level 3) │ -│ • Encrypt with tenant-specific key (AES-256-GCM) │ -│ • Generate content hash for deduplication │ -│ • Store in all cache tiers (L1, L2, L3) │ -│ • Update metrics and cost tracking │ 
-└─────────────────────────────────────────────────────────────┘ -``` - -### Data Flow - -**Cache Hit Path:** -1. Request arrives with prompt, model, parameters -2. Normalize and generate exact hash -3. Check L1 (Redis) → Return if hit (< 1ms) -4. Generate embedding and check L2 (Tile38) → Return if similarity > threshold (< 10ms) -5. Check L3 (S3) by content hash → Promote to L1/L2 and return (< 100ms) -6. Record metrics: cache hit type, latency, tokens saved - -**Cache Miss Path:** -1. Forward request to LLM provider -2. Receive response and measure token usage -3. Compress response (Zstd level 3) -4. Encrypt with tenant KEK (AES-256-GCM) -5. Generate content hash (SHA-256) -6. Store in L1, L2, L3 with appropriate TTLs -7. Record metrics: cache miss, token cost - ---- - -## 3. Core Components Design - -### 3.1 Request Processing Pipeline - -**Input Data Structure:** -- Tenant ID (for isolation) -- User ID (for audit) -- Prompt text (query to LLM) -- Model identifier (gpt-4.1-nano, claude-3, etc.) -- Generation parameters (temperature, max_tokens) -- System prompt (optional) - -**Normalization Steps:** -1. Trim leading/trailing whitespace -2. Apply case normalization (configurable per tenant) -3. Canonicalize JSON parameters (sorted keys) -4. Round floating-point values (temperature to 2 decimals) -5. 
Generate deterministic hash (SHA-256) - -**Hash Generation:** -- Exact hash: SHA-256 of normalized input -- Semantic hash: Vector embedding (384 dimensions) -- Content hash: SHA-256 of response body - -### 3.2 Multi-Tier Cache Architecture - -**L1: Redis Hot Cache** -- Purpose: Exact match lookups for frequently accessed queries -- Key structure: `tenant:{tenant_id}:exact:{hash}` -- Value: Compressed, encrypted response blob + metadata -- TTL: 1-6 hours (extended on access) -- Eviction: LRU+LFU hybrid policy -- Target hit rate: 20-30% of requests - -**L2: Tile38 Semantic Cache** -- Purpose: Similarity-based matching for related queries -- Storage: Vector embeddings with geospatial indexing -- Query method: NEARBY search with distance threshold -- TTL: 6-24 hours -- Similarity thresholds: - - Generic Q&A: 0.98 (very strict) - - Code generation: 0.92 (more flexible) - - Data analysis: 0.95 (moderate) - - Creative writing: 0.88 (loose) -- Target hit rate: 10-20% of requests (after L1 miss) - -**L3: S3 Content-Addressed Store** -- Purpose: Long-term deduplication and archival -- Path structure: `{content_hash[:2]}/{content_hash}` -- Deduplication: Store each unique response once -- Reference counting: Track usage across tenants -- Lifecycle: Move to Glacier after 30 days of no access -- Target hit rate: 5-10% of requests (after L1+L2 miss) - -### 3.3 Semantic Similarity Engine - -**Embedding Generation:** -- Model: Sentence Transformers (all-MiniLM-L6-v2) -- Dimensions: 384 -- Normalization: L2 normalized vectors -- Inference: ONNX Runtime for optimization -- Batching: Process up to 32 prompts per batch -- Caching: Cache embeddings for 1 hour - -**Similarity Search:** -- Algorithm: Approximate Nearest Neighbor (ANN) -- Index: Tile38 geospatial index (treats vectors as coordinates) -- Distance metric: Cosine similarity -- Search radius: 1 - similarity_threshold -- Result limit: Top 1 match above threshold - -**Threshold Tuning:** -- A/B testing framework for threshold 
optimization -- Per-tenant and per-use-case customization -- Feedback loop from user corrections -- Metrics: false positive rate, token savings - -### 3.4 Privacy & Multi-Tenant Isolation - -**Tenant Isolation Modes:** - -1. **PRIVATE (default)** - - No cross-tenant cache sharing - - Tenant-specific key namespace - - Full encryption at rest - - Dedicated audit logs - -2. **SHARED_ANONYMOUS** - - Share cache after PII removal - - Strip identifiable information - - Shared namespace for common queries - - Opt-in via tenant settings - -3. **SHARED_OPT_IN** - - Explicitly marked queries only - - User consent required - - Metadata tracking for attribution - -**Security Measures:** -- Tenant KEK (Key Encryption Key) per tenant -- KEK storage: AWS KMS or HashiCorp Vault -- Data encryption: AES-256-GCM -- Key rotation: 90-day automatic rotation -- Access control: RBAC policies per tenant -- Audit logging: All cache access events - -**PII Detection:** -- Named Entity Recognition (NER) for persons, organizations -- Pattern matching for emails, phone numbers, SSNs -- Credit card number detection -- Custom regex patterns per tenant -- Action: Block from shared cache if PII detected - -### 3.5 Eviction Policy (LRU + LFU Hybrid) - -**Scoring Algorithm:** -``` -Score = (Frequency_Score × 0.6) + (Recency_Score × 0.4) - -Frequency_Score = hit_count × exp(-age_in_days) -Recency_Score = 1 / (1 + hours_since_last_access) -``` - -**Eviction Triggers:** -- Memory pressure > 80%: Evict bottom 10% by score -- Memory pressure > 90%: Evict bottom 20% by score -- Manual eviction: API endpoint for cache clearing - -**Special Handling:** -- Always keep top 5% LFU items (pinned) -- Never evict items accessed in last 5 minutes -- Probabilistic eviction to prevent thundering herd -- Graceful degradation: Demote to L2/L3 before deletion - -### 3.6 Compression & Encryption - -**Compression Strategy:** -- Primary: Zstd (level 3) - - Ratio: 60-70% size reduction - - Speed: 200MB/s compression -- 
Fallback: LZ4 - - Ratio: 50% size reduction - - Speed: 500MB/s compression -- Decision: Use Zstd for responses > 10KB, LZ4 for smaller - -**Encryption Pipeline:** -1. Retrieve tenant KEK from KMS -2. Generate data encryption key (DEK) per item -3. Encrypt response with AES-256-GCM -4. Encrypt DEK with KEK (envelope encryption) -5. Store encrypted DEK alongside encrypted data -6. Discard DEK from memory - ---- - -## 4. Building Plan - -### Phase 1: Foundation (Weeks 1-2) - -**Objective:** Basic exact-match caching with single-tenant support - -**Infrastructure Setup:** -- Deploy Redis cluster (3-node cluster for high availability) -- Set up Prometheus and Grafana for metrics -- Configure logging infrastructure (ELK or CloudWatch) - -**Core Development:** -- Build request normalization pipeline -- Implement SHA-256 hash generation -- Create cache middleware layer -- Add Zstd compression -- Implement basic metrics collection - -**Success Criteria:** -- Exact match cache functional -- 30%+ hit ratio on test workload -- < 1ms cache lookup latency (p95) -- Compression achieving 60%+ size reduction - -### Phase 2: Semantic Deduplication (Weeks 3-4) - -**Objective:** Add semantic similarity matching capability - -**Infrastructure Setup:** -- Deploy Tile38 cluster (2-node setup) -- Set up embedding service infrastructure -- Configure GPU instances (optional, for scale) - -**Core Development:** -- Integrate Sentence Transformer model -- Build embedding generation service with batching -- Implement Tile38 NEARBY search -- Create similarity threshold configuration system -- Add semantic match fallback in cache lookup - -**Success Criteria:** -- Semantic cache operational -- 10-15% additional hit ratio improvement -- < 10ms semantic search latency (p95) -- Configurable thresholds working per tenant - -### Phase 3: Multi-Tenant & Privacy (Weeks 5-6) - -**Objective:** Secure multi-tenant isolation and encryption - -**Infrastructure Setup:** -- Configure AWS KMS or HashiCorp Vault 
-- Set up audit logging infrastructure -- Deploy PII detection service - -**Core Development:** -- Implement tenant key namespacing -- Build encryption/decryption layer -- Create PII detection pipeline -- Implement shared cache with anonymization -- Add access control and RBAC -- Build audit logging system - -**Success Criteria:** -- 100% tenant isolation verified -- PII detection blocking sensitive data -- Encryption/decryption overhead < 5ms -- Complete audit trail for all access - -### Phase 4: Content-Addressed Storage (Weeks 7-8) - -**Objective:** Add long-term cache tier with deduplication - -**Infrastructure Setup:** -- Configure S3 bucket with lifecycle policies -- Set up cross-region replication (optional) -- Configure CloudFront for faster retrieval (optional) - -**Core Development:** -- Implement content hashing (SHA-256) -- Build reference counting system -- Create L3 → L1 promotion logic -- Implement garbage collection for unreferenced items -- Add cost tracking and attribution per tenant - -**Success Criteria:** -- 30-day retention functional -- 40%+ storage savings from deduplication -- < 100ms L3 cache hit latency (p95) -- Reference counting accurate - -### Phase 5: Advanced Eviction & Optimization (Weeks 9-10) - -**Objective:** Intelligent eviction and performance optimization - -**Core Development:** -- Implement LRU+LFU hybrid scoring algorithm -- Build memory pressure monitoring -- Create adaptive TTL system based on access patterns -- Implement cache warming for predicted queries -- Add A/B testing framework for cache policies -- Build cost attribution and savings dashboard - -**Success Criteria:** -- 20%+ improvement in cache efficiency -- Adaptive TTLs reducing memory usage -- A/B testing framework operational -- Real-time cost dashboard functional - -### Phase 6: Scale Testing & Hardening (Weeks 11-12) - -**Objective:** Validate 100k+ QPS capability and production readiness - -**Testing Activities:** -- Load testing with 100k+ QPS using 
Locust or K6 -- Chaos engineering (simulate node failures) -- Stress test cache eviction under memory pressure -- Benchmark semantic search at scale -- Test encryption performance under load - -**Production Hardening:** -- Implement circuit breakers for LLM and cache failures -- Add request coalescing for thundering herd prevention -- Build cache warming scripts for deployment -- Create monitoring dashboards and alerts -- Write runbooks for incident response -- Set up on-call rotation and escalation - -**Success Criteria:** -- 100k+ QPS sustained throughput -- 99.9% cache availability (< 43min downtime/month) -- < 5ms p99 latency under full load -- Zero data loss during node failures -- Complete monitoring and alerting coverage - ---- - -## 5. Key Data Structures - -### Cache Request -``` -{ - "tenant_id": "tenant_abc123", - "user_id": "user_xyz789", - "prompt": "What is the capital of France?", - "model": "gpt-4.1-nano", - "temperature": 0.7, - "max_tokens": 1000, - "system_prompt": "You are a helpful assistant" -} -``` - -### Cache Key Structure -``` -Exact Match: - tenant:{tenant_id}:exact:{sha256_hash} - -Semantic Match: - cache:semantic:{tenant_id}:{hash} - -Content-Addressed: - s3://cache-bucket/{content_hash[:2]}/{content_hash} -``` - -### Stored Cache Item -``` -{ - "response": "base64_encoded_compressed_encrypted_blob", - "metadata": { - "model": "gpt-4.1-nano", - "tokens_used": 1250, - "created_at": 1704825600, - "last_accessed": 1704826000, - "hit_count": 15, - "tenant_id": "tenant_abc123", - "content_hash": "sha256_of_response", - "compression": "zstd", - "encryption": "aes-256-gcm" - } -} -``` - -### Embedding Record (Tile38) -``` -SET cache:semantic {tenant_id}:{hash} - FIELD embedding [0.123, 0.456, ..., 0.789] // 384 dimensions - FIELD prompt_hash {exact_hash} - FIELD response_ref s3://bucket/{content_hash} - FIELD model gpt-4.1-nano - FIELD created_at 1704825600 - EX 86400 // 24 hour TTL -``` - ---- - -## 6. 
Metrics & Monitoring - -### Primary KPIs - -**Cache Performance:** -- Cache hit ratio (total): Target 40-60% - - L1 exact hits: 20-30% - - L2 semantic hits: 10-20% - - L3 content-addressed hits: 5-10% -- Cache miss rate: 40-60% - -**Latency Targets:** -- L1 cache hit: < 1ms (p95) -- L2 cache hit: < 10ms (p95) -- L3 cache hit: < 100ms (p95) -- End-to-end with LLM: < 2000ms (p95) - -**Availability:** -- Cache service uptime: 99.9%+ -- Redis cluster availability: 99.95% -- Tile38 availability: 99.9% -- S3 availability: 99.99% - -**Business Metrics:** -- Token savings per day (target: 40% reduction) -- Cost savings per tenant -- Requests per second: 100k+ sustained -- Cache items stored: Millions - -### Monitoring Dashboards - -**Real-Time Operations:** -1. Requests per second (by tenant, by model) -2. Cache hit ratio breakdown (L1/L2/L3) -3. Latency percentiles (p50, p95, p99) -4. Memory utilization per Redis node -5. Eviction rate (items/second) -6. Error rate by component - -**Performance Analysis:** -1. Token savings per hour (cumulative) -2. Top cached queries (by hit count) -3. Cache efficiency by model type -4. Similarity threshold effectiveness -5. Compression ratio by response type -6. Encryption overhead distribution - -**Cost & Attribution:** -1. Token usage vs baseline (with/without cache) -2. Cost savings per tenant -3. Infrastructure cost breakdown -4. ROI calculation (savings - infra cost) -5. Storage usage trends (S3 growth) - -**Security & Compliance:** -1. Failed authentication attempts -2. PII detection triggers -3. Cross-tenant access attempts -4. Encryption key rotations -5. 
Audit log volume and completeness - -### Alerting Thresholds - -**Critical Alerts:** -- Cache availability < 99.9% (5min window) -- Error rate > 1% (1min window) -- P99 latency > 5000ms (5min window) -- Redis memory > 90% (immediate) -- Encryption key rotation failure (immediate) - -**Warning Alerts:** -- Cache hit ratio < 35% (1hr window) -- L1 hit ratio < 15% (1hr window) -- Eviction rate > 1000/sec (10min window) -- S3 storage growth > 50% week-over-week -- PII detection rate > 5% (1hr window) - ---- - -## 7. Operational Considerations - -### Deployment Strategy - -**Infrastructure as Code:** -- Terraform for all infrastructure provisioning -- Kubernetes manifests for service deployment -- Helm charts for Redis and Tile38 clusters -- GitOps workflow with ArgoCD - -**Deployment Phases:** -1. Deploy to dev environment -2. Run integration tests -3. Deploy to staging with production-like load -4. Canary deployment to 5% of production traffic -5. Gradual rollout to 100% over 24 hours -6. Monitor for 48 hours before marking stable - -**Rollback Plan:** -- Blue-green deployment for zero downtime -- Automated rollback if error rate > 0.5% -- Keep previous version running for 24 hours -- Database migrations backward compatible - -### Cache Warming Strategies - -**On Deployment:** -- Export top 10k queries from production -- Replay queries to warm cache -- Prioritize high-value queries (expensive models) - -**Continuous Warming:** -- Predict queries based on time-of-day patterns -- Pre-compute embeddings for common topics -- Monitor trending topics and pre-cache - -**Cold Start Mitigation:** -- Keep snapshot of hot cache in S3 -- Restore snapshot on new node startup -- Parallel warming across multiple workers - -### Disaster Recovery - -**Backup Strategy:** -- Redis RDB snapshots every 5 minutes -- S3 cross-region replication (optional) -- Tile38 AOF (Append Only File) persistence -- Metadata backup to separate database - -**Recovery Procedures:** -- Redis cluster: 
Automatic failover to replica -- S3: Read from replica region if primary unavailable -- Tile38: Restore from AOF, rebuild index if needed -- Complete rebuild: 2-4 hours for 10M items - -**Testing:** -- Monthly disaster recovery drills -- Simulate node failures in staging -- Validate backup restore procedures -- Measure recovery time objectives (RTO: 15min) - -### Capacity Planning - -**Growth Projections:** -- Requests growth: 20% month-over-month -- Cache items: 10M → 50M over 6 months -- Storage: 1TB → 10TB over 6 months - -**Scaling Triggers:** -- Redis memory > 70%: Add node to cluster -- CPU utilization > 60%: Vertical scaling -- Request rate > 80k QPS: Horizontal scaling -- S3 growth > 5TB: Review retention policies - -**Resource Allocation:** -- Redis: 128GB RAM per node, 3-5 nodes -- Tile38: 64GB RAM per node, 2-3 nodes -- Embedding service: 16GB RAM, GPU optional -- S3: Unlimited, with lifecycle management - ---- - -## 8. Challenges & Mitigation Strategies - -### Challenge 1: Semantic False Positives - -**Problem:** Similar prompts with different intents get incorrectly matched - -**Mitigation:** -- Context-aware embeddings (include model, temperature in embedding) -- Human-in-the-loop validation for borderline matches (0.90-0.93 similarity) -- User feedback mechanism to flag incorrect matches -- A/B testing to optimize thresholds per query type -- Separate thresholds for different domains (code vs Q&A vs creative) - -### Challenge 2: Cold Start Performance - -**Problem:** Empty cache after deployment or eviction leads to poor initial performance - -**Mitigation:** -- Pre-populate cache with common queries from analytics -- Implement cache warming on deployment -- Use probabilistic eviction (always keep 5% of LFU items) -- Build cache export/import for faster recovery -- Maintain warm standby cache for failover - -### Challenge 3: PII Leakage in Shared Cache - -**Problem:** Sensitive user data appearing in other tenants' cached responses - -**Mitigation:** 
-- Strict PII detection using NER and pattern matching -- Default to private cache for all tenants -- Explicit opt-in required for shared cache -- Redact common PII patterns before caching -- Separate shared cache namespace with enhanced auditing -- Regular compliance audits and penetration testing - -### Challenge 4: Thundering Herd on Cache Miss - -**Problem:** 1000 concurrent requests for same query → 1000 LLM calls - -**Mitigation:** -- Request coalescing: First request locks, others wait -- Distributed lock using Redis SETNX -- Timeout after 30 seconds, fallback to LLM -- Probabilistic early expiration to prevent TTL stampede -- Jittered retry logic for failed requests - -### Challenge 5: Memory Pressure at Scale - -**Problem:** Millions of cached items exceed Redis capacity - -**Mitigation:** -- Tiered eviction strategy (L3 → L2 → L1) -- Bloom filters for existence checks (reduce memory) -- Embedding quantization (384 dims → 96 dims) -- Archive to S3 after 24 hours of no access -- Separate hot and warm cache clusters -- Implement adaptive TTL based on access patterns - -### Challenge 6: Encryption Performance Overhead - -**Problem:** Encryption/decryption adds latency to cache operations - -**Mitigation:** -- Use hardware-accelerated AES (AES-NI) -- Batch encryption operations -- Cache decrypted responses in application memory (with TTL) -- Use authenticated encryption to avoid separate HMAC -- Consider envelope encryption to minimize KMS calls - -### Challenge 7: Cross-Region Latency - -**Problem:** Global users experience high latency for cache lookups - -**Mitigation:** -- Deploy Redis clusters in multiple regions -- Use geo-routing to nearest cache cluster -- Replicate L3 (S3) across regions with cross-region replication -- Implement eventual consistency for cache updates -- Cache embeddings regionally to reduce latency - -### Challenge 8: Cache Poisoning - -**Problem:** Malicious actors attempt to cache harmful responses - -**Mitigation:** -- Rate 
limiting on cache writes per tenant -- Content validation before caching -- Separate cache for untrusted tenants -- Monitor for anomalous caching patterns -- Implement cache purge API for incident response -- Audit all cache writes with source tracking - ---- - -## 9. Success Metrics Summary - -### Technical Metrics -- **Cache Hit Ratio:** 40-60% overall -- **Latency:** < 1ms (L1), < 10ms (L2), < 100ms (L3) at p95 -- **Availability:** 99.9%+ uptime -- **Throughput:** 100k+ QPS sustained -- **Scale:** Millions of cached items - -### Business Metrics -- **Token Savings:** 40% reduction in token consumption -- **Cost Savings:** Significant monthly savings vs baseline -- **ROI:** 100x+ (savings vs infrastructure cost) - -### Security Metrics -- **Tenant Isolation:** 100% (zero cross-tenant leaks) -- **PII Detection:** 99%+ accuracy -- **Encryption Coverage:** 100% of cached data -- **Audit Completeness:** 100% of access events logged - -### Operational Metrics -- **Deployment Frequency:** Weekly releases with zero downtime -- **Mean Time to Recovery (MTTR):** < 15 minutes -- **Change Failure Rate:** < 5% -- **Incident Response Time:** < 5 minutes to acknowledge - ---- - -## 10. 
Future Enhancements - -### Phase 7: Advanced Features (Post-Launch) - -**Predictive Caching:** -- Machine learning to predict upcoming queries -- Preemptive cache warming based on patterns -- Seasonal and time-of-day optimizations - -**Multi-Modal Support:** -- Cache images, audio, video responses -- Binary content deduplication -- Specialized compression for media types - -**Federated Caching:** -- Cross-organization cache sharing (with consent) -- Industry-specific shared caches -- Open-source knowledge base integration - -**Smart Cache Invalidation:** -- Detect when cached responses become stale -- LLM-based staleness detection -- Automatic refresh of outdated content - -**Advanced Analytics:** -- Query pattern analysis and clustering -- Cache optimization recommendations -- Anomaly detection for unusual patterns -- Cost optimization suggestions per tenant - ---- - -## Appendix: Technology Stack - -### Core Infrastructure -- **Redis 7.x**: In-memory hot cache (cluster mode) -- **Tile38 1.x**: Geospatial/vector similarity search -- **S3 / Object Storage**: Long-term content-addressed storage - -### Embedding & ML -- **Sentence Transformers**: all-MiniLM-L6-v2 (384 dimensions) -- **ONNX Runtime**: Optimized inference engine -- **PyTorch**: Model serving framework - -### Compression & Encryption -- **Zstd**: Primary compression (level 3) -- **LZ4**: Fallback fast compression -- **AES-256-GCM**: Authenticated encryption -- **AWS KMS / HashiCorp Vault**: Key management - -### Observability -- **Prometheus**: Metrics collection and aggregation -- **Grafana**: Dashboards and visualization -- **Jaeger**: Distributed tracing -- **ELK Stack / CloudWatch**: Log aggregation and analysis - -### Development & Deployment -- **Terraform**: Infrastructure as Code -- **Kubernetes**: Container orchestration -- **Helm**: Package management for K8s -- **ArgoCD**: GitOps continuous delivery -- **Docker**: Containerization - -### Testing & Quality -- **Locust / K6**: Load testing 
tools -- **Chaos Mesh**: Chaos engineering -- **pytest**: Unit and integration testing -- **SonarQube**: Code quality analysis \ No newline at end of file diff --git a/dashboard/src/components/layout/AppSidebar.tsx b/dashboard/src/components/layout/AppSidebar.tsx index d20ab6d..cc6a515 100644 --- a/dashboard/src/components/layout/AppSidebar.tsx +++ b/dashboard/src/components/layout/AppSidebar.tsx @@ -4,7 +4,6 @@ import { BarChart3, Key, FileText, - CreditCard, Settings, ChevronRight, Search, @@ -16,8 +15,13 @@ import { Database, Coins, TrendingUp, - Zap + Zap, + Shield, + Layers, + ChevronDown, + Check } from 'lucide-react'; +import { useState } from 'react'; import { NavLink, useLocation } from 'react-router-dom'; import { cn } from '@/lib/utils'; import { motion } from 'framer-motion'; @@ -27,6 +31,16 @@ import { TooltipTrigger, TooltipProvider } from '@/components/ui/tooltip'; +import { + DropdownMenu, + DropdownMenuContent, + DropdownMenuItem, + DropdownMenuLabel, + DropdownMenuSeparator, + DropdownMenuTrigger, +} from '@/components/ui/dropdown-menu'; +import { useAuth } from '@/providers/auth-provider'; +import { useTeamContext } from '@/providers/team-context'; interface NavItem { id: string; @@ -49,9 +63,10 @@ const WORKSPACE_NAV: NavItem[] = [ const PLATFORM_NAV: NavItem[] = [ { id: 'keys', icon: Key, label: 'API Keys', href: '/keys' }, + { id: 'access', icon: Shield, label: 'Members & Invites', href: '/access' }, + { id: 'teams-projects', icon: Layers, label: 'Teams & Projects', href: '/teams-projects' }, { id: 'providers', icon: Zap, label: 'Providers', href: '/providers' }, { id: 'cache', icon: Database, label: 'Cache Control', href: '/cache' }, - { id: 'billing', icon: CreditCard, label: 'Billing', href: '/billing' }, { id: 'pricing', icon: Coins, label: 'Cost Config', href: '/admin/pricing' }, //{ id: 'settings', icon: Settings, label: 'Settings', href: '/settings' }, ]; @@ -60,8 +75,6 @@ const ANALYTICS_NAV: NavItem[] = [ { id: 'usage', icon: 
Activity, label: 'Usage', href: '/analytics/usage' }, { id: 'cost', icon: DollarSign, label: 'Cost', href: '/analytics/cost' }, { id: 'projections', icon: TrendingUp, label: 'Projections', href: '/analytics/projections' }, - { id: 'performance', icon: BarChart3, label: 'Performance', href: '/analytics/performance' }, - ]; @@ -69,6 +82,27 @@ const ANALYTICS_NAV: NavItem[] = [ export function AppSidebar({ collapsed, onCollapse }: AppSidebarProps) { + const { user, role, memberships, switchOrg } = useAuth(); + const { resetFilters } = useTeamContext(); + const [isSwitchingOrg, setIsSwitchingOrg] = useState(false); + + const membershipsForSelect = memberships.length > 0 + ? memberships + : (user?.org_id ? [{ org_id: user.org_id, org_name: 'Current Organization', org_slug: '', role: role || 'viewer', status: 'active', is_owner: role === 'owner' }] : []); + + const activeOrg = membershipsForSelect.find((membership) => membership.org_id === user?.org_id); + + const handleOrgChange = async (nextOrgId: string) => { + if (!nextOrgId || nextOrgId === user?.org_id) return; + try { + setIsSwitchingOrg(true); + await switchOrg(nextOrgId); + resetFilters(); + } finally { + setIsSwitchingOrg(false); + } + }; + return (