diff --git a/cloudflare-worker/.env.example b/cloudflare-worker/.env.example new file mode 100644 index 00000000..3ebcd44b --- /dev/null +++ b/cloudflare-worker/.env.example @@ -0,0 +1,19 @@ +# Cloudflare Worker Environment Configuration +# +# Copy this file to .env and fill in your values +# These are only needed for local development with `wrangler dev` + +# Cloudflare Account ID +# Find at: https://dash.cloudflare.com/ -> select account -> Account ID in sidebar +CLOUDFLARE_ACCOUNT_ID=your_account_id_here + +# D1 Database ID +# Get from: wrangler d1 create voicemode-telemetry +CLOUDFLARE_DATABASE_ID=your_d1_database_id_here + +# KV Namespace ID +# Get from: wrangler kv:namespace create "RATE_LIMITS" +CLOUDFLARE_KV_NAMESPACE_ID=your_kv_namespace_id_here + +# Deployed Worker URL (after deployment) +WORKER_URL=https://voicemode-telemetry.YOUR_SUBDOMAIN.workers.dev diff --git a/cloudflare-worker/.gitignore b/cloudflare-worker/.gitignore new file mode 100644 index 00000000..1cadf37f --- /dev/null +++ b/cloudflare-worker/.gitignore @@ -0,0 +1,19 @@ +# Environment variables +.env + +# Wrangler outputs +.wrangler/ +worker/ + +# Node modules (if using local development) +node_modules/ + +# IDE +.vscode/ +.idea/ + +# MacOS +.DS_Store + +# Logs +*.log diff --git a/cloudflare-worker/ARCHITECTURE.md b/cloudflare-worker/ARCHITECTURE.md new file mode 100644 index 00000000..aa753f77 --- /dev/null +++ b/cloudflare-worker/ARCHITECTURE.md @@ -0,0 +1,469 @@ +# VoiceMode Telemetry Architecture + +## Request Flow + +``` +┌─────────────────────────────────────────────────────────────────────────┐ +│ 1. VoiceMode Client (Python) │ +│ - Collects usage data from logs │ +│ - Generates event_id (SHA-256 hash) │ +│ - Sends POST /telemetry │ +└────────────────────────┬────────────────────────────────────────────────┘ + │ + │ HTTP POST + │ Content-Type: application/json + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────┐ +│ 2. Cloudflare Edge Network │ +│ - DDoS protection │ +│ - TLS termination │ +│ - Routes to nearest worker instance │ +└────────────────────────┬────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────┐ +│ 3. Worker: Validate Payload │ +│ ┌─────────────────────────────────────────────────────────────┐ │ +│ │ ✓ Check event_id exists and is string │ │ +│ │ ✓ Check telemetry_id is valid UUID │ │ +│ │ ✓ Check timestamp is ISO 8601 and within 90 days │ │ +│ │ ✓ Check environment object exists │ │ +│ │ ✓ Check usage object exists │ │ +│ └─────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ├─── Invalid? ──────┐ │ +│ │ ▼ │ +│ │ ┌──────────────────┐ │ +│ │ │ Return 400 │ │ +│ │ │ {error: "..."} │ │ +│ │ └──────────────────┘ │ +│ │ │ +│ ▼ Valid │ +└─────────────────────────┬───────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────┐ +│ 4. Worker: Check Idempotency (D1 Database) │ +│ ┌─────────────────────────────────────────────────────────────┐ │ +│ │ SELECT event_id FROM events WHERE event_id = ? │ │ +│ └─────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ├─── Exists? ───────┐ │ +│ │ ▼ │ +│ │ ┌──────────────────┐ │ +│ │ │ Return 200 │ │ +│ │ │ {status: "ok", │ │ +│ │ │ message: "..." }│ │ +│ │ └──────────────────┘ │ +│ │ │ +│ ▼ New Event │ +└─────────────────────────┬───────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────┐ +│ 5. Worker: Check Rate Limits (KV Store) │ +│ ┌─────────────────────────────────────────────────────────────┐ │ +│ │ A. Check telemetry_id rate limit │ │ +│ │ Key: rate:id:{telemetry_id} │ │ +│ │ Limit: 10 events/hour │ │ +│ │ TTL: 3600 seconds │ │ +│ └─────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ├─── Exceeded? ─────┐ │ +│ │ ▼ │ +│ │ ┌──────────────────┐ │ +│ │ │ Return 429 │ │ +│ │ │ {error: "...", │ │ +│ │ │ retry_after: │ │ +│ │ │ 3600} │ │ +│ │ └──────────────────┘ │ +│ │ │ +│ ▼ OK │ +│ ┌─────────────────────────────────────────────────────────────┐ │ +│ │ B. Check IP rate limit │ │ +│ │ Key: rate:ip:{hashed_ip} │ │ +│ │ Limit: 100 events/hour │ │ +│ │ TTL: 3600 seconds │ │ +│ └─────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ├─── Exceeded? ─────┐ │ +│ │ ▼ │ +│ │ ┌──────────────────┐ │ +│ │ │ Return 429 │ │ +│ │ │ (same as above) │ │ +│ │ └──────────────────┘ │ +│ │ │ +│ ▼ OK │ +└─────────────────────────┬───────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────┐ +│ 6. Worker: Hash IP Address │ +│ ┌─────────────────────────────────────────────────────────────┐ │ +│ │ client_ip = request.headers.get('CF-Connecting-IP') │ │ +│ │ hashed_ip = SHA256(client_ip).substring(0, 16) │ │ +│ └─────────────────────────────────────────────────────────────┘ │ +└─────────────────────────┬───────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────┐ +│ 7. Worker: Store Event (D1 Database) │ +│ ┌─────────────────────────────────────────────────────────────┐ │ +│ │ INSERT INTO events ( │ │ +│ │ event_id, │ │ +│ │ telemetry_id, │ │ +│ │ timestamp, │ │ +│ │ environment, -- JSON string │ │ +│ │ usage, -- JSON string │ │ +│ │ client_ip_hash, -- SHA-256 hash │ │ +│ │ created_at -- Server timestamp │ │ +│ │ ) VALUES (?, ?, ?, ?, ?, ?, ?) │ │ +│ └─────────────────────────────────────────────────────────────┘ │ +└─────────────────────────┬───────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────┐ +│ 8. Worker: Increment Rate Limit Counters (KV Store) │ +│ ┌─────────────────────────────────────────────────────────────┐ │ +│ │ A. Increment telemetry_id counter │ │ +│ │ rate:id:{telemetry_id} += 1 (TTL: 3600s) │ │ +│ │ │ │ +│ │ B. Increment IP counter │ │ +│ │ rate:ip:{hashed_ip} += 1 (TTL: 3600s) │ │ +│ └─────────────────────────────────────────────────────────────┘ │ +└─────────────────────────┬───────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────┐ +│ 9. Worker: Return Success │ +│ ┌─────────────────────────────────────────────────────────────┐ │ +│ │ HTTP 200 OK │ │ +│ │ Content-Type: application/json │ │ +│ │ Access-Control-Allow-Origin: * │ │ +│ │ │ │ +│ │ { │ │ +│ │ "status": "ok", │ │ +│ │ "event_id": "abc123..." │ │ +│ │ } │ │ +│ └─────────────────────────────────────────────────────────────┘ │ +└─────────────────────────┬───────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────┐ +│ 10. VoiceMode Client Receives Response │ +│ - 200 OK: Success, event recorded │ +│ - 429 Too Many Requests: Rate limited, retry after X seconds │ +│ - 400 Bad Request: Invalid payload, don't retry │ +│ - 500 Internal Server Error: Server issue, retry with backoff │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +## Data Storage + +### D1 Database (SQLite) + +```sql +events +├── event_id (TEXT PRIMARY KEY) -- Idempotency key +├── telemetry_id (TEXT NOT NULL) -- Anonymous user ID (UUID) +├── timestamp (TEXT NOT NULL) -- Event time (ISO 8601) +├── created_at (TEXT NOT NULL) -- Server receipt time (ISO 8601) +├── environment (TEXT NOT NULL) -- JSON blob +│ └── { +│ "os": "Linux", +│ "version": "1.0.0", +│ "installation_method": "uv", +│ "mcp_host": "claude-code", +│ "execution_source": "mcp" +│ } +├── usage (TEXT NOT NULL) -- JSON blob +│ └── { +│ "total_sessions": 5, +│ "duration_distribution": {...}, +│ "transport_usage": {...}, +│ "provider_usage": {...}, +│ "success_rate": {...}, +│ "error_types": {...} +│ } +└── client_ip_hash (TEXT NOT NULL) -- SHA-256(IP)[0:16] + +Indexes: +- idx_events_telemetry_id (telemetry_id) +- idx_events_timestamp (timestamp) +- idx_events_created_at (created_at) +- idx_events_telemetry_timestamp (telemetry_id, timestamp) +``` + +### KV Store (Key-Value) + +``` +Rate Limiting Keys: +├── rate:id:{telemetry_id} +│ ├── Value: Integer (event count) +│ ├── TTL: 3600 seconds (1 hour) +│ └── Limit: 10 +│ +└── rate:ip:{hashed_ip} + ├── Value: Integer (event count) + ├── TTL: 3600 seconds (1 hour) + └── Limit: 100 +``` + +## Scheduled Tasks + +### Daily Cleanup (Cron: 0 2 * * *) + +``` +┌─────────────────────────────────────────┐ +│ Runs at 2:00 AM UTC daily │ +│ │ +│ DELETE FROM events │ +│ WHERE created_at < datetime('now', │ +│ '-90 days') │ +│ │ +│ Purpose: Prevent unbounded growth │ +│ Retention: 90 days │ +└─────────────────────────────────────────┘ +``` + +## Privacy Protections + +``` +┌──────────────────────────────────────────────────────────────┐ +│ 1. IP Address Hashing │ +│ Input: 192.168.1.100 │ +│ Hash: SHA256(IP) │ +│ Store: ab12cd34ef567890 (first 16 chars) │ +│ Result: Cannot reverse to original IP │ +│ │ +│ 2. Anonymous User ID │ +│ - UUID generated by client │ +│ - No link to user identity │ +│ - Stored in ~/.voicemode/telemetry_id │ +│ │ +│ 3. No PII Collection │ +│ - No names, emails, or user accounts │ +│ - No file paths (anonymized) │ +│ - No error messages with sensitive data │ +│ │ +│ 4. Data Minimization │ +│ - Only collect what's needed for analytics │ +│ - Aggregate counts, not individual actions │ +│ - Binned durations (not precise timestamps) │ +│ │ +│ 5. Automatic Deletion │ +│ - Events deleted after 90 days │ +│ - No long-term storage │ +└──────────────────────────────────────────────────────────────┘ +``` + +## Rate Limiting Strategy + +### Two-Tier Rate Limiting + +``` +┌────────────────────────────────────────────────────────────────┐ +│ Tier 1: Per Telemetry ID (User Device) │ +│ ├─ Limit: 10 events/hour │ +│ ├─ Window: Sliding (TTL-based) │ +│ ├─ Purpose: Prevent single client abuse │ +│ └─ Key: rate:id:{telemetry_id} │ +│ │ +│ Tier 2: Per IP Address │ +│ ├─ Limit: 100 events/hour │ +│ ├─ Window: Sliding (TTL-based) │ +│ ├─ Purpose: Prevent DoS attacks │ +│ └─ Key: rate:ip:{hashed_ip} │ +│ │ +│ Both must pass for request to be accepted │ +└────────────────────────────────────────────────────────────────┘ +``` + +### Example Scenarios + +``` +Scenario 1: Normal Usage +├─ User sends 1 event/day +├─ Rate limit: 0/10 (telemetry_id), 0/100 (IP) +└─ Result: ✓ Accepted + +Scenario 2: Retry Storm +├─ User sends 15 events rapidly (network retry bug) +├─ Rate limit: 10/10 (telemetry_id), 15/100 (IP) +├─ Events 1-10: ✓ Accepted +├─ Events 11-15: ✗ Rejected (429) +└─ Result: Protects backend from client bugs + +Scenario 3: Shared IP (NAT/VPN) +├─ 50 users behind same IP +├─ Each sends 2 events/day +├─ Rate limit: 2/10 per user, 100/100 (IP) +├─ Events 1-100: ✓ Accepted +├─ Events 101+: ✗ Rejected (429) +└─ Result: IP limit protects from shared network abuse + +Scenario 4: Attack +├─ Attacker sends 1000 events from different IPs +├─ Same telemetry_id +├─ Rate limit: 10/10 (telemetry_id) +├─ Events 1-10: ✓ Accepted +├─ Events 11-1000: ✗ Rejected (429) +└─ Result: telemetry_id limit stops attack +``` + +## Error Handling + +``` +┌────────────────────────────────────────────────────────────────┐ +│ HTTP Status Codes │ +│ │ +│ 200 OK │ +│ ├─ Event successfully stored │ +│ ├─ Event already exists (idempotent) │ +│ └─ Response: {status: "ok", event_id: "..."} │ +│ │ +│ 400 Bad Request │ +│ ├─ Invalid JSON payload │ +│ ├─ Missing required fields │ +│ ├─ Invalid field format (UUID, timestamp) │ +│ └─ Response: {error: "descriptive error message"} │ +│ │ +│ 429 Too Many Requests │ +│ ├─ Rate limit exceeded (telemetry_id or IP) │ +│ ├─ Retry-After header set to 3600 seconds │ +│ └─ Response: {error: "...", retry_after: 3600} │ +│ │ +│ 500 Internal Server Error │ +│ ├─ Database error │ +│ ├─ KV namespace error │ +│ ├─ Unexpected exception │ +│ └─ Response: {error: "Internal server error"} │ +└────────────────────────────────────────────────────────────────┘ +``` + +## Performance Characteristics + +``` +┌────────────────────────────────────────────────────────────────┐ +│ Latency (p50/p95/p99) │ +│ ├─ Validation: < 1ms │ +│ ├─ D1 lookup (idempotency): 5-10ms / 15-25ms / 30-50ms │ +│ ├─ KV read (rate limit): 1-3ms / 5-10ms / 15-25ms │ +│ ├─ D1 write: 10-20ms / 30-50ms / 50-100ms │ +│ ├─ KV write: 5-10ms / 15-25ms / 30-50ms │ +│ └─ Total: ~25ms / ~75ms / ~150ms │ +│ │ +│ Throughput │ +│ ├─ Workers: 1000+ requests/second │ +│ ├─ D1: 100+ writes/second │ +│ ├─ KV: 1000+ reads/second, 100+ writes/second │ +│ └─ Bottleneck: D1 writes (sufficient for MVP) │ +│ │ +│ Scalability │ +│ ├─ Workers: Auto-scales to millions of requests │ +│ ├─ D1: Scales to billions of rows │ +│ ├─ KV: Scales to billions of keys │ +│ └─ Cost scales linearly with usage │ +└────────────────────────────────────────────────────────────────┘ +``` + +## Monitoring & Observability + +``` +┌────────────────────────────────────────────────────────────────┐ +│ Cloudflare Dashboard │ +│ ├─ Requests/second (time-series graph) │ +│ ├─ Error rate (4xx, 5xx) │ +│ ├─ CPU time (ms per request) │ +│ ├─ Success rate (%) │ +│ └─ Data egress (bandwidth) │ +│ │ +│ Wrangler Tail (Real-time Logs) │ +│ ├─ console.log() statements │ +│ ├─ Errors and exceptions │ +│ ├─ Request/response details │ +│ └─ Performance metrics │ +│ │ +│ D1 Analytics │ +│ ├─ Query execution time │ +│ ├─ Read/write operations │ +│ ├─ Database size │ +│ └─ Row count │ +│ │ +│ KV Analytics │ +│ ├─ Read/write operations │ +│ ├─ Key count │ +│ ├─ Storage usage │ +│ └─ TTL expirations │ +└────────────────────────────────────────────────────────────────┘ +``` + +## Security Model + +``` +┌────────────────────────────────────────────────────────────────┐ +│ Threat Mitigation │ +│ │ +│ 1. DoS Attack │ +│ ├─ Threat: Overwhelm endpoint with requests │ +│ ├─ Mitigation: IP-based rate limiting (100/hour) │ +│ └─ Additional: Cloudflare DDoS protection │ +│ │ +│ 2. Client Bug (Retry Storm) │ +│ ├─ Threat: Client bug causes infinite retry loop │ +│ ├─ Mitigation: telemetry_id rate limiting (10/hour) │ +│ └─ Additional: Idempotency prevents duplicate storage │ +│ │ +│ 3. Data Poisoning │ +│ ├─ Threat: Submit malicious data to corrupt analytics │ +│ ├─ Mitigation: Strict payload validation │ +│ └─ Additional: JSON schema enforcement │ +│ │ +│ 4. Privacy Breach │ +│ ├─ Threat: Leak user identity or IP address │ +│ ├─ Mitigation: IP hashing, anonymous UUIDs │ +│ └─ Additional: No PII collection │ +│ │ +│ 5. SQL Injection │ +│ ├─ Threat: Inject SQL via payload │ +│ ├─ Mitigation: Prepared statements with binding │ +│ └─ Additional: D1 SQLite parameterized queries │ +│ │ +│ 6. Storage Exhaustion │ +│ ├─ Threat: Fill database with events │ +│ ├─ Mitigation: Rate limiting + automatic cleanup │ +│ └─ Additional: 90-day retention policy │ +└────────────────────────────────────────────────────────────────┘ +``` + +## Cost Model + +``` +┌────────────────────────────────────────────────────────────────┐ +│ Free Tier Limits (per day) │ +│ ├─ Workers: 100,000 requests │ +│ ├─ D1 reads: 5,000,000 rows │ +│ ├─ D1 writes: 100,000 rows │ +│ ├─ KV reads: 100,000 operations │ +│ ├─ KV writes: 1,000 operations │ +│ └─ KV storage: 1 GB │ +│ │ +│ Paid Tier ($5/month + usage) │ +│ ├─ Workers: Unlimited requests │ +│ ├─ D1: Same limits (generous) │ +│ ├─ KV reads: 10,000,000/day (included) │ +│ ├─ KV writes: 1,000,000/day (included) │ +│ ├─ Additional: $0.50 per million requests │ +│ └─ KV storage: $0.50/GB/month │ +│ │ +│ Cost Projection (1000 users, 10 events/day/user) │ +│ ├─ Events: 300,000/month = 10,000/day │ +│ ├─ Worker requests: 10,000/day (within free tier) │ +│ ├─ D1 writes: 10,000/day (within free tier) │ +│ ├─ KV writes: 20,000/day (rate limit counters) │ +│ │ └─ Exceeds free tier! Need paid plan │ +│ ├─ Storage: ~100 MB (within free tier) │ +│ └─ Total: $5/month (paid tier required for KV writes) │ +└────────────────────────────────────────────────────────────────┘ +``` diff --git a/cloudflare-worker/DEPLOYMENT.md b/cloudflare-worker/DEPLOYMENT.md new file mode 100644 index 00000000..bb9dcf07 --- /dev/null +++ b/cloudflare-worker/DEPLOYMENT.md @@ -0,0 +1,361 @@ +# Deployment Checklist for VoiceMode Telemetry Worker + +This checklist guides you through deploying the VoiceMode telemetry endpoint to Cloudflare Workers. + +## Prerequisites + +- [ ] Cloudflare account (free tier is sufficient) +- [ ] Node.js 18+ installed +- [ ] Git (for version control of worker code) + +## One-Time Setup + +### 1. Install Wrangler CLI + +```bash +npm install -g wrangler +``` + +**Verify installation:** +```bash +wrangler --version +``` + +### 2. Authenticate with Cloudflare + +```bash +wrangler login +``` + +This opens a browser window to authorize Wrangler with your Cloudflare account. + +### 3. Get Your Account ID + +1. Go to https://dash.cloudflare.com/ +2. Select your account from the dropdown +3. Copy the **Account ID** from the right sidebar +4. Save this ID - you'll need it in the next step + +### 4. Configure wrangler.toml + +Edit `wrangler.toml` and replace the placeholder: + +```toml +account_id = "YOUR_ACCOUNT_ID_HERE" # Replace with your actual Account ID +``` + +### 5. Create KV Namespace + +```bash +wrangler kv:namespace create "RATE_LIMITS" +``` + +**Expected output:** +``` +Created namespace with title "voicemode-telemetry-RATE_LIMITS" +Add the following to your wrangler.toml: +{ binding = "KV", id = "abc123..." } +``` + +Copy the `id` value and update `wrangler.toml`: + +```toml +[[kv_namespaces]] +binding = "KV" +id = "abc123..." # Replace with the ID from the command output +``` + +### 6. Create D1 Database + +```bash +wrangler d1 create voicemode-telemetry +``` + +**Expected output:** +``` +Created database voicemode-telemetry +database_id = "xyz789..." +``` + +Copy the `database_id` value and update `wrangler.toml`: + +```toml +[[d1_databases]] +binding = "DB" +database_name = "voicemode-telemetry" +database_id = "xyz789..." # Replace with the ID from the command output +``` + +### 7. Initialize Database Schema + +```bash +wrangler d1 execute voicemode-telemetry --file=schema.sql +``` + +**Expected output:** +``` +🌀 Executing on remote database voicemode-telemetry (xyz789...): +🌀 To execute on your local development database, pass the --local flag. +🚣 Executed 5 commands in 0.234ms +``` + +Verify the table was created: + +```bash +wrangler d1 execute voicemode-telemetry --command "SELECT name FROM sqlite_master WHERE type='table'" +``` + +Should show: `events` + +### 8. Deploy the Worker + +```bash +wrangler deploy +``` + +**Expected output:** +``` +Total Upload: XX.XX KiB / gzip: XX.XX KiB +Uploaded voicemode-telemetry (X.XX sec) +Published voicemode-telemetry (X.XX sec) + https://voicemode-telemetry.YOUR_SUBDOMAIN.workers.dev +``` + +**Save the worker URL!** You'll need it for VoiceMode configuration. + +## Testing Deployment + +### 1. Test with curl + +```bash +export WORKER_URL="https://voicemode-telemetry.YOUR_SUBDOMAIN.workers.dev/telemetry" + +curl -X POST $WORKER_URL \ + -H "Content-Type: application/json" \ + -d '{ + "event_id": "test_'$(date +%s)'", + "telemetry_id": "550e8400-e29b-41d4-a716-446655440000", + "timestamp": "'$(date -u +%Y-%m-%dT%H:%M:%SZ)'", + "environment": { + "os": "Linux", + "version": "1.0.0", + "installation_method": "uv", + "mcp_host": "claude-code", + "execution_source": "mcp" + }, + "usage": { + "total_sessions": 1, + "duration_distribution": {}, + "transport_usage": {}, + "provider_usage": {} + } + }' +``` + +**Expected response:** +```json +{"status":"ok","event_id":"test_..."} +``` + +### 2. Run Test Suite + +```bash +./test-endpoint.sh $WORKER_URL +``` + +All tests should pass (PASSED in green). + +### 3. Verify Database + +```bash +wrangler d1 execute voicemode-telemetry --command \ + "SELECT COUNT(*) as total_events FROM events" +``` + +Should show the test events you just created. + +### 4. Monitor Worker Logs + +In a separate terminal, run: + +```bash +wrangler tail +``` + +Then send another test request and watch the logs appear in real-time. + +## Configure VoiceMode + +Once deployment is successful, configure VoiceMode to use your telemetry endpoint: + +### Option 1: Via CLI + +```bash +voicemode config set VOICEMODE_TELEMETRY_ENDPOINT https://voicemode-telemetry.YOUR_SUBDOMAIN.workers.dev/telemetry +``` + +### Option 2: Manual Edit + +Edit `~/.voicemode/voicemode.env`: + +```bash +# Telemetry configuration +VOICEMODE_TELEMETRY=true +VOICEMODE_TELEMETRY_ENDPOINT=https://voicemode-telemetry.YOUR_SUBDOMAIN.workers.dev/telemetry +``` + +## Verify End-to-End + +### 1. Use VoiceMode + +Use VoiceMode normally (have some voice conversations). + +### 2. Check for Telemetry Events + +```bash +# In the cloudflare-worker directory +wrangler d1 execute voicemode-telemetry --command \ + "SELECT event_id, telemetry_id, timestamp FROM events ORDER BY created_at DESC LIMIT 5" +``` + +You should see events from your VoiceMode usage. + +### 3. Query DAU + +```bash +wrangler d1 execute voicemode-telemetry --command \ + "SELECT COUNT(DISTINCT telemetry_id) as dau FROM events WHERE DATE(timestamp) = DATE('now')" +``` + +## Post-Deployment Tasks + +- [ ] Save your worker URL in a secure location (password manager, docs) +- [ ] Document the deployment in your team wiki/docs +- [ ] Set up monitoring alerts (optional - Cloudflare dashboard) +- [ ] Schedule regular checks of telemetry data (weekly/monthly) +- [ ] Add worker URL to VoiceMode production configuration +- [ ] Consider setting up a custom domain (see README.md) + +## Updating the Worker + +When you make changes to `worker.js`: + +```bash +# Test locally first (optional) +wrangler dev + +# Deploy changes +wrangler deploy +``` + +Changes take effect immediately (no downtime). + +## Rollback Procedure + +If something goes wrong after deployment: + +```bash +# View deployment history +wrangler deployments list + +# Rollback to previous version +wrangler rollback +``` + +## Monitoring Checklist + +### Daily (Automated) +- [ ] Cron job runs successfully (check logs at 2am UTC) +- [ ] No error spikes in Cloudflare dashboard + +### Weekly +- [ ] Review error logs: `wrangler tail --format=json | grep error` +- [ ] Check database size: `wrangler d1 execute voicemode-telemetry --command "SELECT COUNT(*) FROM events"` +- [ ] Verify rate limits are working (check for 429 responses) + +### Monthly +- [ ] Export telemetry data for analysis +- [ ] Review costs in Cloudflare billing dashboard +- [ ] Clean up old queued events in VoiceMode clients +- [ ] Update worker dependencies: `npm update wrangler` + +## Troubleshooting + +### Worker returns 500 errors after deployment + +1. Check logs: `wrangler tail` +2. Verify database schema: `wrangler d1 execute voicemode-telemetry --command "PRAGMA table_info(events)"` +3. Check KV binding: `wrangler kv:namespace list` + +### Rate limits not working + +1. Verify KV namespace is bound: Check `wrangler.toml` +2. Test manually: Send 11 requests rapidly to same telemetry_id +3. Check KV keys: `wrangler kv:key list --namespace-id=YOUR_KV_ID` + +### Database queries slow + +1. Check indexes: `wrangler d1 execute voicemode-telemetry --command "SELECT * FROM sqlite_master WHERE type='index'"` +2. Vacuum database: `wrangler d1 execute voicemode-telemetry --command "VACUUM"` +3. Consider archiving old data + +## Security Notes + +- **Never commit `.env` files** (already in `.gitignore`) +- **Rotate secrets annually** (KV namespace IDs, database IDs) +- **Monitor for abuse** (check for unusual traffic patterns) +- **Keep wrangler.toml private** if it contains sensitive IDs + +## Cost Management + +### Monitor Costs + +1. Go to: https://dash.cloudflare.com/ → Account Home → Billing +2. Review Workers usage under "Workers Paid" or "Workers Free" +3. Set up billing alerts (optional) + +### Stay Within Free Tier + +- **Workers:** 100,000 requests/day +- **KV:** 1,000 writes/day (may need paid tier) +- **D1:** 100,000 writes/day + +If approaching limits, consider: +- Reducing telemetry frequency in VoiceMode +- Upgrading to Workers Paid ($5/month) +- Implementing client-side sampling (send 10% of events) + +## Success Criteria + +✅ Deployment is successful when: + +1. Worker URL returns 200 OK for valid requests +2. Test suite passes all 6 tests +3. Events appear in D1 database +4. Rate limiting triggers on 11th request +5. VoiceMode clients can send telemetry +6. No errors in `wrangler tail` logs + +## Support + +- **Cloudflare Discord:** https://discord.gg/cloudflaredev +- **Workers Documentation:** https://developers.cloudflare.com/workers/ +- **D1 Documentation:** https://developers.cloudflare.com/d1/ +- **VoiceMode Issues:** https://github.com/mbailey/voicemode/issues + +## Next Steps After Deployment + +1. **Analytics:** Set up regular queries to understand usage patterns +2. **Dashboards:** Build Grafana/Metabase dashboards for visualization +3. **Alerts:** Configure alerts for error rate spikes +4. **Optimization:** Monitor performance and optimize queries +5. **Documentation:** Update VoiceMode docs with telemetry information + +--- + +**Deployment Date:** _______________ +**Deployed By:** _______________ +**Worker URL:** _______________ +**Account ID:** _______________ +**Database ID:** _______________ +**KV Namespace ID:** _______________ diff --git a/cloudflare-worker/QUICK_START.md b/cloudflare-worker/QUICK_START.md new file mode 100644 index 00000000..67247e74 --- /dev/null +++ b/cloudflare-worker/QUICK_START.md @@ -0,0 +1,218 @@ +# Quick Start: Deploy VoiceMode Telemetry Endpoint + +This is a condensed guide to get your telemetry endpoint deployed quickly. For detailed information, see [DEPLOYMENT.md](DEPLOYMENT.md) and [README.md](README.md). + +## TL;DR - 5 Minute Setup + +```bash +# 1. Install Wrangler +npm install -g wrangler + +# 2. Login to Cloudflare +wrangler login + +# 3. Get your Account ID from https://dash.cloudflare.com/ and edit wrangler.toml + +# 4. Create resources +wrangler kv:namespace create "RATE_LIMITS" +# Copy the ID and update wrangler.toml [[kv_namespaces]] section + +wrangler d1 create voicemode-telemetry +# Copy the database_id and update wrangler.toml [[d1_databases]] section + +# 5. Initialize database +wrangler d1 execute voicemode-telemetry --file=schema.sql + +# 6. Deploy +wrangler deploy + +# 7. Test +./test-endpoint.sh https://voicemode-telemetry.YOUR_SUBDOMAIN.workers.dev/telemetry + +# 8. Configure VoiceMode +voicemode config set VOICEMODE_TELEMETRY_ENDPOINT https://voicemode-telemetry.YOUR_SUBDOMAIN.workers.dev/telemetry +``` + +## What You Get + +- **Endpoint:** `https://voicemode-telemetry.YOUR_SUBDOMAIN.workers.dev/telemetry` +- **Rate Limiting:** 10 events/hour per user, 100 events/hour per IP +- **Privacy:** IP addresses hashed, no PII collected +- **Cost:** Free tier covers < 1000 users (~$0/month) +- **Idempotency:** Duplicate events handled gracefully +- **Automatic Cleanup:** Events older than 90 days removed daily + +## Files Created + +``` +cloudflare-worker/ +├── worker.js # Main Worker code (370 lines) +├── wrangler.toml # Cloudflare configuration +├── schema.sql # D1 database schema +├── README.md # Comprehensive documentation +├── DEPLOYMENT.md # Step-by-step deployment guide +├── test-endpoint.sh # Automated test suite +├── package.json # npm scripts +├── .gitignore # Git ignore rules +├── .env.example # Environment template +└── QUICK_START.md # This file +``` + +## Required Configuration + +Edit `wrangler.toml` with your values: + +```toml +account_id = "YOUR_ACCOUNT_ID_HERE" # From Cloudflare dashboard + +[[kv_namespaces]] +binding = "KV" +id = "YOUR_KV_NAMESPACE_ID_HERE" # From: wrangler kv:namespace create + +[[d1_databases]] +binding = "DB" +database_name = "voicemode-telemetry" +database_id = "YOUR_D1_DATABASE_ID_HERE" # From: wrangler d1 create +``` + +## Verify Deployment + +### 1. Test Endpoint + +```bash +curl -X POST https://voicemode-telemetry.YOUR_SUBDOMAIN.workers.dev/telemetry \ + -H "Content-Type: application/json" \ + -d '{ + "event_id": "test123", + "telemetry_id": "550e8400-e29b-41d4-a716-446655440000", + "timestamp": "2024-12-14T10:00:00Z", + "environment": {"os": "Linux", "version": "1.0.0"}, + "usage": {"total_sessions": 1} + }' +``` + +Expected: `{"status":"ok","event_id":"test123"}` + +### 2. Check Database + +```bash +wrangler d1 execute voicemode-telemetry --command \ + "SELECT COUNT(*) FROM events" +``` + +Expected: `1` (from test above) + +### 3. Run Full Test Suite + +```bash +./test-endpoint.sh https://voicemode-telemetry.YOUR_SUBDOMAIN.workers.dev/telemetry +``` + +All tests should show `✓ PASSED` in green. + +## Usage in VoiceMode + +After deployment, configure VoiceMode: + +```bash +# Option 1: Via CLI +voicemode config set VOICEMODE_TELEMETRY_ENDPOINT https://voicemode-telemetry.YOUR_SUBDOMAIN.workers.dev/telemetry + +# Option 2: Edit ~/.voicemode/voicemode.env +echo "VOICEMODE_TELEMETRY_ENDPOINT=https://voicemode-telemetry.YOUR_SUBDOMAIN.workers.dev/telemetry" >> ~/.voicemode/voicemode.env +``` + +Telemetry will start flowing automatically (if user has opted in). + +## Monitoring + +### View Live Logs + +```bash +wrangler tail +``` + +### Query Telemetry Data + +```bash +# Daily Active Users +wrangler d1 execute voicemode-telemetry --command \ + "SELECT COUNT(DISTINCT telemetry_id) as dau FROM events WHERE DATE(timestamp) = DATE('now')" + +# Total events +wrangler d1 execute voicemode-telemetry --command \ + "SELECT COUNT(*) as total FROM events" + +# Recent events +wrangler d1 execute voicemode-telemetry --command \ + "SELECT * FROM events ORDER BY created_at DESC LIMIT 5" +``` + +### Cloudflare Dashboard + +View metrics at: https://dash.cloudflare.com/ → Workers & Pages → voicemode-telemetry + +## Cost Estimate + +| Users | Events/Month | Cost | +|-------|--------------|------| +| < 100 | ~30,000 | $0 (free tier) | +| 100-1000 | ~300,000 | $0-5 | +| 1000-10000 | ~3,000,000 | $5-15 | + +Free tier limits: +- Workers: 100,000 requests/day +- D1: 100,000 writes/day, 5 million reads/day +- KV: 1,000 writes/day (may hit limit - upgrade to $5/month plan) + +## Troubleshooting + +### Worker returns 500 + +```bash +# Check logs +wrangler tail + +# Verify schema +wrangler d1 execute voicemode-telemetry --command "PRAGMA table_info(events)" +``` + +### Rate limit not working + +```bash +# Check KV keys +wrangler kv:key list --namespace-id=YOUR_KV_NAMESPACE_ID +``` + +### Need to reset everything + +```bash +# Delete database (WARNING: destroys all data) +wrangler d1 delete voicemode-telemetry + +# Delete KV namespace +wrangler kv:namespace delete --namespace-id=YOUR_KV_NAMESPACE_ID + +# Then recreate from step 4 above +``` + +## Next Steps + +1. ✅ Deploy worker (you just did this!) +2. ✅ Test endpoint +3. ✅ Configure VoiceMode +4. 📊 Set up analytics queries (see README.md) +5. 📈 Monitor usage in Cloudflare dashboard +6. 🔔 Set up alerts (optional) +7. 📝 Document your worker URL for team + +## Support + +- **Detailed Docs:** [README.md](README.md) +- **Deployment Guide:** [DEPLOYMENT.md](DEPLOYMENT.md) +- **Cloudflare Docs:** https://developers.cloudflare.com/workers/ +- **VoiceMode Issues:** https://github.com/mbailey/voicemode/issues + +--- + +**Deployment complete!** Your telemetry endpoint is now ready to collect privacy-preserving usage data from VoiceMode clients. diff --git a/cloudflare-worker/README.md b/cloudflare-worker/README.md new file mode 100644 index 00000000..67eef64e --- /dev/null +++ b/cloudflare-worker/README.md @@ -0,0 +1,517 @@ +# VoiceMode Telemetry Cloudflare Worker + +This directory contains the Cloudflare Worker implementation for VoiceMode's telemetry endpoint. The worker receives, validates, rate-limits, and stores telemetry events from VoiceMode clients. + +## Features + +- **Payload Validation**: Strict JSON schema validation for all incoming events +- **Rate Limiting**: + - Per anonymous ID: 10 events/hour + - Per IP address: 100 events/hour +- **Idempotency**: Duplicate event_id submissions return success without re-storing +- **Privacy**: IP addresses are SHA-256 hashed before storage +- **Automatic Cleanup**: Daily cron job removes events older than 90 days +- **CORS Support**: Allows cross-origin requests from VoiceMode clients + +## Architecture + +``` +┌─────────────────┐ +│ VoiceMode Client│ +│ (Python) │ +└────────┬────────┘ + │ POST /telemetry + │ {event_id, telemetry_id, timestamp, environment, usage} + ▼ +┌─────────────────────────────────┐ +│ Cloudflare Worker │ +│ ├─ Validate payload │ +│ ├─ Check idempotency (D1) │ +│ ├─ Check rate limits (KV) │ +│ ├─ Store event (D1) │ +│ └─ Increment rate counters (KV)│ +└─────────────────────────────────┘ + │ + ├────────────┬────────────┐ + ▼ ▼ ▼ + ┌──────┐ ┌──────┐ ┌──────────┐ + │ D1 │ │ KV │ │Analytics │ + │ (DB) │ │(Rate)│ │ (Future) │ + └──────┘ └──────┘ └──────────┘ +``` + +## Prerequisites + +1. **Cloudflare Account** + - Sign up at https://dash.cloudflare.com/sign-up + - Free tier is sufficient for MVP (< 1000 users) + +2. **Node.js and npm** + - Install from https://nodejs.org/ (v18+ recommended) + +3. **Wrangler CLI** + - Install: `npm install -g wrangler` + - Documentation: https://developers.cloudflare.com/workers/wrangler/ + +## Setup Instructions + +### 1. Install Wrangler + +```bash +npm install -g wrangler +``` + +### 2. Authenticate with Cloudflare + +```bash +wrangler login +``` + +This will open a browser window to authorize Wrangler with your Cloudflare account. + +### 3. Get Your Account ID + +1. Go to https://dash.cloudflare.com/ +2. Select your account +3. Copy the **Account ID** from the right sidebar +4. Edit `wrangler.toml` and replace `YOUR_ACCOUNT_ID_HERE` with your Account ID + +### 4. Create KV Namespace (for rate limiting) + +```bash +wrangler kv:namespace create "RATE_LIMITS" +``` + +Copy the `id` from the output and update `wrangler.toml`: + +```toml +[[kv_namespaces]] +binding = "KV" +id = "your_kv_namespace_id_here" # Replace with actual ID +``` + +### 5. Create D1 Database (for telemetry events) + +```bash +wrangler d1 create voicemode-telemetry +``` + +Copy the `database_id` from the output and update `wrangler.toml`: + +```toml +[[d1_databases]] +binding = "DB" +database_name = "voicemode-telemetry" +database_id = "your_d1_database_id_here" # Replace with actual ID +``` + +### 6. Initialize Database Schema + +```bash +wrangler d1 execute voicemode-telemetry --file=schema.sql +``` + +This creates the `events` table and necessary indexes. + +### 7. Deploy the Worker + +```bash +wrangler deploy +``` + +The command will output your worker URL, something like: +``` +https://voicemode-telemetry.YOUR_SUBDOMAIN.workers.dev +``` + +**Save this URL** - you'll need it for the VoiceMode configuration. + +## Configuration in VoiceMode + +Once deployed, configure VoiceMode to use your telemetry endpoint: + +```bash +# Set the telemetry endpoint URL +voicemode config set VOICEMODE_TELEMETRY_ENDPOINT https://voicemode-telemetry.YOUR_SUBDOMAIN.workers.dev/telemetry + +# Enable telemetry (user must opt-in) +voicemode config set VOICEMODE_TELEMETRY true +``` + +Or add to `~/.voicemode/voicemode.env`: + +```bash +VOICEMODE_TELEMETRY=true +VOICEMODE_TELEMETRY_ENDPOINT=https://voicemode-telemetry.YOUR_SUBDOMAIN.workers.dev/telemetry +``` + +## Testing the Endpoint + +### Manual Test with curl + +```bash +curl -X POST https://voicemode-telemetry.YOUR_SUBDOMAIN.workers.dev/telemetry \ + -H "Content-Type: application/json" \ + -d '{ + "event_id": "test123", + "telemetry_id": "550e8400-e29b-41d4-a716-446655440000", + "timestamp": "2024-12-14T10:30:00Z", + "environment": { + "os": "Linux", + "version": "1.0.0", + "installation_method": "uv", + "mcp_host": "claude-code", + "execution_source": "mcp" + }, + "usage": { + "total_sessions": 5, + "duration_distribution": {"1-5min": 3, "5-10min": 2}, + "transport_usage": {"local": 4, "livekit": 1}, + "provider_usage": { + "tts": {"openai": 3, "kokoro": 2}, + "stt": {"whisper-local": 5} + } + } + }' +``` + +Expected response: +```json +{ + "status": "ok", + "event_id": "test123" +} +``` + +### Test Idempotency + +Send the same request again - should return: +```json +{ + "status": "ok", + "message": "Event already recorded" +} +``` + +### Test Rate Limiting + +Send 11 requests with the same `telemetry_id` within an hour: +```json +{ + "error": "Rate limit exceeded for telemetry ID", + "retry_after": 3600 +} +``` + +### Test Invalid Payload + +```bash +curl -X POST https://voicemode-telemetry.YOUR_SUBDOMAIN.workers.dev/telemetry \ + -H "Content-Type: application/json" \ + -d '{"invalid": "data"}' +``` + +Expected response: +```json +{ + "error": "Invalid or missing event_id" +} +``` + +## Monitoring and Debugging + +### View Worker Logs + +```bash +wrangler tail +``` + +This streams real-time logs from your worker. Keep this running while testing. + +### Query D1 Database + +```bash +# Count total events +wrangler d1 execute voicemode-telemetry --command "SELECT COUNT(*) as total FROM events" + +# View recent events +wrangler d1 execute voicemode-telemetry --command "SELECT event_id, telemetry_id, timestamp FROM events ORDER BY created_at DESC LIMIT 10" + +# Daily active users +wrangler d1 execute voicemode-telemetry --command "SELECT COUNT(DISTINCT telemetry_id) as dau FROM events WHERE DATE(timestamp) = DATE('now')" +``` + +### Check KV Namespace (rate limits) + +```bash +# List all keys +wrangler kv:key list --namespace-id=YOUR_KV_NAMESPACE_ID + +# Get a specific rate limit counter +wrangler kv:key get "rate:id:550e8400-e29b-41d4-a716-446655440000" --namespace-id=YOUR_KV_NAMESPACE_ID +``` + +### View Worker Metrics + +Go to: https://dash.cloudflare.com/ → Workers & Pages → voicemode-telemetry → Metrics + +Metrics include: +- Requests per second +- Error rate +- CPU time usage +- Success/error status codes + +## Querying Telemetry Data + +### Example SQL Queries + +#### Daily Active Users (DAU) + +```sql +SELECT COUNT(DISTINCT telemetry_id) as dau +FROM events +WHERE DATE(timestamp) = DATE('now'); +``` + +#### Weekly Active Users (WAU) + +```sql +SELECT COUNT(DISTINCT telemetry_id) as wau +FROM events +WHERE timestamp >= datetime('now', '-7 days'); +``` + +#### Sessions by Operating System + +```sql +SELECT + json_extract(environment, '$.os') as os, + SUM(json_extract(usage, '$.total_sessions')) as total_sessions +FROM events +GROUP BY os +ORDER BY total_sessions DESC; +``` + +#### TTS Provider Usage + +```sql +SELECT + DATE(timestamp) as date, + json_extract(usage, '$.provider_usage.tts') as tts_providers +FROM events +WHERE timestamp >= datetime('now', '-30 days') +ORDER BY date DESC; +``` + +#### 7-Day Retention + +```sql +WITH first_seen AS ( + SELECT telemetry_id, MIN(DATE(timestamp)) as first_date + FROM events + GROUP BY telemetry_id +) +SELECT + COUNT(DISTINCT e.telemetry_id) as retained_users, + COUNT(DISTINCT f.telemetry_id) as cohort_size, + ROUND(100.0 * COUNT(DISTINCT e.telemetry_id) / COUNT(DISTINCT f.telemetry_id), 2) as retention_pct +FROM first_seen f +LEFT JOIN events e ON e.telemetry_id = f.telemetry_id + AND DATE(e.timestamp) = DATE(f.first_date, '+7 days') +WHERE f.first_date = DATE('now', '-7 days'); +``` + +#### Average Sessions Per User + +```sql +SELECT + AVG(total) as avg_sessions_per_user +FROM ( + SELECT + telemetry_id, + SUM(json_extract(usage, '$.total_sessions')) as total + FROM events + GROUP BY telemetry_id +); +``` + +### Run Queries via Wrangler + +```bash +wrangler d1 execute voicemode-telemetry --command "YOUR_SQL_QUERY_HERE" +``` + +### Export Data + +```bash +# Export to JSON +wrangler d1 export voicemode-telemetry --output=telemetry-export.sql + +# Query and save to file +wrangler d1 execute voicemode-telemetry \ + --command "SELECT * FROM events WHERE timestamp >= datetime('now', '-7 days')" \ + --json > last-7-days.json +``` + +## Cost Estimation + +### Free Tier (Sufficient for MVP) + +**Workers:** +- 100,000 requests/day +- 10ms CPU time per request +- 128MB memory + +**KV:** +- 100,000 reads/day +- 1,000 writes/day +- 1GB storage + +**D1:** +- 5 million row reads/day +- 100,000 row writes/day +- 5GB storage + +### Expected Usage (1,000 users, 10 events/day) + +- Requests: ~10,000/day (well within 100k limit) +- KV writes: ~20,000/day (rate limit counters) - **exceeds free tier** +- D1 writes: ~10,000/day (within limit) +- Storage: ~100MB for 90 days of events (within limit) + +**Recommendation:** Start with free tier. If KV writes exceed limits, upgrade to Workers Paid ($5/month base) which includes unlimited requests and higher KV limits. + +### Paid Tier (if needed) + +**Workers Paid ($5/month + usage):** +- Unlimited requests +- 50ms CPU time per request +- 10 million KV reads/day (included) +- 1 million KV writes/day (included) +- Additional requests: $0.50 per million + +At 1,000 users: Approximately **$5-8/month** + +## Maintenance + +### Daily Automatic Cleanup + +The worker includes a scheduled task (cron) that runs daily at 2am UTC to delete events older than 90 days. This prevents unbounded database growth. + +View cron trigger status: +```bash +wrangler deployments list +``` + +### Manual Cleanup + +```bash +# Delete events older than 90 days +wrangler d1 execute voicemode-telemetry --command \ + "DELETE FROM events WHERE created_at < datetime('now', '-90 days')" + +# Vacuum database to reclaim space +wrangler d1 execute voicemode-telemetry --command "VACUUM" +``` + +### Update Worker Code + +After making changes to `worker.js`: + +```bash +wrangler deploy +``` + +No downtime - Cloudflare deploys atomically. + +## Troubleshooting + +### "Error: No account_id found" + +Edit `wrangler.toml` and add your Account ID from the Cloudflare dashboard. + +### "Error: KV namespace not found" + +Run `wrangler kv:namespace create "RATE_LIMITS"` and update the `id` in `wrangler.toml`. + +### "Error: D1 database not found" + +Run `wrangler d1 create voicemode-telemetry` and update the `database_id` in `wrangler.toml`. + +### "Error: table events does not exist" + +Initialize the database schema: +```bash +wrangler d1 execute voicemode-telemetry --file=schema.sql +``` + +### Worker returns 500 errors + +Check logs: +```bash +wrangler tail +``` + +Common issues: +- Database not initialized (run schema.sql) +- KV namespace not bound (check wrangler.toml) +- JSON parsing error (check request payload) + +### Rate limits triggering unexpectedly + +Check KV values: +```bash +wrangler kv:key list --namespace-id=YOUR_KV_NAMESPACE_ID +``` + +Reset a rate limit manually: +```bash +wrangler kv:key delete "rate:id:TELEMETRY_ID" --namespace-id=YOUR_KV_NAMESPACE_ID +``` + +## Security Considerations + +1. **IP Anonymization**: IP addresses are SHA-256 hashed before storage +2. **No PII**: Only anonymous UUIDs are collected, no user names or emails +3. **Rate Limiting**: Prevents abuse and resource exhaustion +4. **CORS**: Configured to accept requests from any origin (required for CLI tool) +5. **Payload Validation**: Strict schema validation prevents malformed data +6. **Data Retention**: Automatic cleanup after 90 days + +## Custom Domain (Optional) + +To use a custom domain instead of `workers.dev`: + +1. Add a domain to Cloudflare +2. Add a route in `wrangler.toml`: + +```toml +routes = [ + { pattern = "telemetry.yourdomain.com", custom_domain = true } +] +``` + +3. Deploy: + +```bash +wrangler deploy +``` + +## Next Steps + +1. **Deploy the worker** following the setup instructions +2. **Test with curl** to verify it's working +3. **Configure VoiceMode** with the endpoint URL +4. **Monitor metrics** in Cloudflare dashboard +5. **Query data** to analyze usage patterns + +## Support + +- Cloudflare Workers Docs: https://developers.cloudflare.com/workers/ +- D1 Database Docs: https://developers.cloudflare.com/d1/ +- Wrangler CLI Docs: https://developers.cloudflare.com/workers/wrangler/ +- VoiceMode Issues: https://github.com/mbailey/voicemode/issues + +## License + +This code is part of the VoiceMode project and shares the same license. diff --git a/cloudflare-worker/package.json b/cloudflare-worker/package.json new file mode 100644 index 00000000..849cd3d9 --- /dev/null +++ b/cloudflare-worker/package.json @@ -0,0 +1,23 @@ +{ + "name": "voicemode-telemetry-worker", + "version": "1.0.0", + "description": "Cloudflare Worker for VoiceMode telemetry collection", + "main": "worker.js", + "scripts": { + "deploy": "wrangler deploy", + "dev": "wrangler dev", + "tail": "wrangler tail", + "test": "./test-endpoint.sh" + }, + "keywords": [ + "cloudflare", + "workers", + "telemetry", + "voicemode" + ], + "author": "VoiceMode Contributors", + "license": "MIT", + "devDependencies": { + "wrangler": "^3.0.0" + } +} diff --git a/cloudflare-worker/schema.sql b/cloudflare-worker/schema.sql new file mode 100644 index 00000000..905fd1a2 --- /dev/null +++ b/cloudflare-worker/schema.sql @@ -0,0 +1,94 @@ +-- VoiceMode Telemetry D1 Database Schema +-- +-- This schema stores telemetry events with privacy protections: +-- - IP addresses are hashed before storage +-- - Event data is stored as JSON for flexibility +-- - Automatic cleanup of old events via scheduled worker + +-- Main events table +CREATE TABLE IF NOT EXISTS events ( + -- Event identification + event_id TEXT PRIMARY KEY, -- SHA-256 hash from client (telemetry_id + timestamp) + telemetry_id TEXT NOT NULL, -- Anonymous UUID from client device + + -- Timestamps + timestamp TEXT NOT NULL, -- Event timestamp (ISO 8601 from client) + created_at TEXT NOT NULL, -- Server receipt time (ISO 8601) + + -- Event data (stored as JSON for flexibility) + environment TEXT NOT NULL, -- JSON: {os, version, installation_method, mcp_host, execution_source} + usage TEXT NOT NULL, -- JSON: {total_sessions, duration_distribution, provider_usage, etc.} + + -- Privacy-preserving metadata + client_ip_hash TEXT NOT NULL -- SHA-256 hash of client IP (for rate limiting only) +); + +-- Indexes for efficient queries + +-- Index for querying events by telemetry_id (user retention analysis) +CREATE INDEX IF NOT EXISTS idx_events_telemetry_id +ON events(telemetry_id); + +-- Index for querying events by timestamp (DAU, time-series analysis) +CREATE INDEX IF NOT EXISTS idx_events_timestamp +ON events(timestamp); + +-- Index for cleanup queries (finding old events) +CREATE INDEX IF NOT EXISTS idx_events_created_at +ON events(created_at); + +-- Composite index for user activity over time +CREATE INDEX IF NOT EXISTS idx_events_telemetry_timestamp +ON events(telemetry_id, timestamp); + +-- Notes on schema design: +-- +-- 1. Event ID is the primary key to enforce idempotency - duplicate +-- submissions with the same event_id will be rejected by the database. +-- +-- 2. We store environment and usage as JSON TEXT rather than normalized +-- tables because: +-- - The schema may evolve over time +-- - D1 supports JSON functions for querying (json_extract) +-- - Simplifies inserts and reduces table complexity +-- - VoiceMode is early stage - premature optimization avoided +-- +-- 3. IP addresses are hashed before storage and only used for rate limiting. +-- We cannot reverse the hash to identify users. +-- +-- 4. The timestamp column uses the client-provided timestamp (privacy: no +-- server timezone leak), while created_at uses server time for ordering +-- and cleanup. +-- +-- 5. No foreign keys or complex constraints - keep it simple for MVP. +-- +-- Example queries: +-- +-- Daily Active Users (DAU): +-- SELECT COUNT(DISTINCT telemetry_id) as dau +-- FROM events +-- WHERE DATE(timestamp) = DATE('now'); +-- +-- Sessions by OS: +-- SELECT json_extract(environment, '$.os') as os, +-- SUM(json_extract(usage, '$.total_sessions')) as sessions +-- FROM events +-- GROUP BY os; +-- +-- Provider usage over time: +-- SELECT DATE(timestamp) as date, +-- json_extract(usage, '$.provider_usage.tts') as tts_providers +-- FROM events +-- ORDER BY date DESC; +-- +-- Retention (7-day): +-- WITH first_seen AS ( +-- SELECT telemetry_id, MIN(DATE(timestamp)) as first_date +-- FROM events +-- GROUP BY telemetry_id +-- ) +-- SELECT +-- COUNT(DISTINCT e.telemetry_id) as retained_users +-- FROM events e +-- JOIN first_seen f ON e.telemetry_id = f.telemetry_id +-- WHERE DATE(e.timestamp) = DATE(f.first_date, '+7 days'); diff --git a/cloudflare-worker/sync-telemetry-local.sh b/cloudflare-worker/sync-telemetry-local.sh new file mode 100755 index 00000000..e78bf4b8 --- /dev/null +++ b/cloudflare-worker/sync-telemetry-local.sh @@ -0,0 +1,61 @@ +#!/bin/bash +# Sync VoiceMode telemetry from Cloudflare D1 to local SQLite database +# +# Usage: ./sync-telemetry-local.sh [output-path] +# +# Default output: ~/.voicemode/telemetry/telemetry.db + +set -e + +# Configuration +D1_DATABASE="voicemode-telemetry" +DEFAULT_OUTPUT_DIR="${HOME}/.voicemode/telemetry" +DEFAULT_OUTPUT_FILE="${DEFAULT_OUTPUT_DIR}/telemetry.db" + +# Use custom path if provided, otherwise use default +OUTPUT_FILE="${1:-$DEFAULT_OUTPUT_FILE}" +OUTPUT_DIR="$(dirname "$OUTPUT_FILE")" + +# Create output directory if needed +mkdir -p "$OUTPUT_DIR" + +# Temporary file for SQL export +TEMP_SQL=$(mktemp) +trap "rm -f $TEMP_SQL" EXIT + +echo "🔄 Syncing telemetry from Cloudflare D1..." +echo " Database: $D1_DATABASE" +echo " Output: $OUTPUT_FILE" + +# Check if wrangler is installed +if ! command -v wrangler &> /dev/null; then + echo "❌ Error: wrangler CLI not found" + echo " Install with: npm install -g wrangler" + exit 1 +fi + +# Export from D1 +echo "📥 Exporting from D1..." +wrangler d1 export "$D1_DATABASE" --remote --output="$TEMP_SQL" 2>&1 | grep -v "^⛅️\|^─\|^$" + +# Check if export succeeded +if [ ! -s "$TEMP_SQL" ]; then + echo "❌ Error: Export failed or produced empty file" + exit 1 +fi + +# Import into local SQLite +echo "📦 Creating local SQLite database..." +rm -f "$OUTPUT_FILE" +sqlite3 "$OUTPUT_FILE" < "$TEMP_SQL" + +# Verify +EVENT_COUNT=$(sqlite3 "$OUTPUT_FILE" "SELECT COUNT(*) FROM events;") +echo "✅ Sync complete!" +echo " Events synced: $EVENT_COUNT" +echo " Database: $OUTPUT_FILE" + +# Show latest event +echo "" +echo "📊 Latest event:" +sqlite3 "$OUTPUT_FILE" "SELECT datetime(created_at) as synced, telemetry_id, json_extract(usage, '$.total_sessions') as sessions FROM events ORDER BY created_at DESC LIMIT 1;" diff --git a/cloudflare-worker/test-endpoint.sh b/cloudflare-worker/test-endpoint.sh new file mode 100755 index 00000000..8eabb676 --- /dev/null +++ b/cloudflare-worker/test-endpoint.sh @@ -0,0 +1,213 @@ +#!/bin/bash +# +# Test script for VoiceMode telemetry endpoint +# +# Usage: +# ./test-endpoint.sh https://voicemode-telemetry.YOUR_SUBDOMAIN.workers.dev/telemetry + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Endpoint URL (first argument or default) +ENDPOINT="${1:-https://voicemode-telemetry.YOUR_SUBDOMAIN.workers.dev/telemetry}" + +if [[ "$ENDPOINT" == *"YOUR_SUBDOMAIN"* ]]; then + echo -e "${RED}Error: Please provide your worker URL${NC}" + echo "Usage: $0 " + echo "Example: $0 https://voicemode-telemetry.my-account.workers.dev/telemetry" + exit 1 +fi + +echo -e "${YELLOW}Testing VoiceMode Telemetry Endpoint${NC}" +echo "Endpoint: $ENDPOINT" +echo "" + +# Test 1: Valid payload +echo -e "${YELLOW}Test 1: Valid payload${NC}" +RESPONSE=$(curl -s -w "\n%{http_code}" -X POST "$ENDPOINT" \ + -H "Content-Type: application/json" \ + -d '{ + "event_id": "test_'$(date +%s)'", + "telemetry_id": "550e8400-e29b-41d4-a716-446655440000", + "timestamp": "'$(date -u +%Y-%m-%dT%H:%M:%SZ)'", + "environment": { + "os": "Linux", + "version": "1.0.0", + "installation_method": "uv", + "mcp_host": "claude-code", + "execution_source": "mcp" + }, + "usage": { + "total_sessions": 5, + "duration_distribution": {"1-5min": 3, "5-10min": 2}, + "transport_usage": {"local": 4, "livekit": 1}, + "provider_usage": { + "tts": {"openai": 3, "kokoro": 2}, + "stt": {"whisper-local": 5} + } + } + }') + +HTTP_CODE=$(echo "$RESPONSE" | tail -n1) +BODY=$(echo "$RESPONSE" | sed '$d') + +if [ "$HTTP_CODE" -eq 200 ]; then + echo -e "${GREEN}✓ PASSED${NC} - Status: $HTTP_CODE" + echo "Response: $BODY" +else + echo -e "${RED}✗ FAILED${NC} - Expected 200, got $HTTP_CODE" + echo "Response: $BODY" +fi +echo "" + +# Test 2: Idempotency (same event_id) +echo -e "${YELLOW}Test 2: Idempotency (duplicate event_id)${NC}" +EVENT_ID="idempotency_test_$(date +%s)" +TELEMETRY_ID="650e8400-e29b-41d4-a716-446655440001" + +# First request +curl -s -X POST "$ENDPOINT" \ + -H "Content-Type: application/json" \ + -d '{ + "event_id": "'$EVENT_ID'", + "telemetry_id": "'$TELEMETRY_ID'", + "timestamp": "'$(date -u +%Y-%m-%dT%H:%M:%SZ)'", + "environment": { + "os": "Linux", + "version": "1.0.0", + "installation_method": "dev", + "mcp_host": "claude-code", + "execution_source": "mcp" + }, + "usage": {"total_sessions": 1} + }' > /dev/null + +# Second request (duplicate) +RESPONSE=$(curl -s -w "\n%{http_code}" -X POST "$ENDPOINT" \ + -H "Content-Type: application/json" \ + -d '{ + "event_id": "'$EVENT_ID'", + "telemetry_id": "'$TELEMETRY_ID'", + "timestamp": "'$(date -u +%Y-%m-%dT%H:%M:%SZ)'", + "environment": { + "os": "Linux", + "version": "1.0.0", + "installation_method": "dev", + "mcp_host": "claude-code", + "execution_source": "mcp" + }, + "usage": {"total_sessions": 1} + }') + +HTTP_CODE=$(echo "$RESPONSE" | tail -n1) +BODY=$(echo "$RESPONSE" | sed '$d') + +if [ "$HTTP_CODE" -eq 200 ] && echo "$BODY" | grep -q "already recorded"; then + echo -e "${GREEN}✓ PASSED${NC} - Status: $HTTP_CODE (idempotent)" + echo "Response: $BODY" +else + echo -e "${RED}✗ FAILED${NC} - Expected idempotent response" + echo "Response: $BODY" +fi +echo "" + +# Test 3: Missing required field +echo -e "${YELLOW}Test 3: Invalid payload (missing telemetry_id)${NC}" +RESPONSE=$(curl -s -w "\n%{http_code}" -X POST "$ENDPOINT" \ + -H "Content-Type: application/json" \ + -d '{ + "event_id": "test_invalid", + "timestamp": "'$(date -u +%Y-%m-%dT%H:%M:%SZ)'", + "environment": {"os": "Linux", "version": "1.0.0"}, + "usage": {"total_sessions": 1} + }') + +HTTP_CODE=$(echo "$RESPONSE" | tail -n1) +BODY=$(echo "$RESPONSE" | sed '$d') + +if [ "$HTTP_CODE" -eq 400 ]; then + echo -e "${GREEN}✓ PASSED${NC} - Status: $HTTP_CODE" + echo "Response: $BODY" +else + echo -e "${RED}✗ FAILED${NC} - Expected 400, got $HTTP_CODE" + echo "Response: $BODY" +fi +echo "" + +# Test 4: Invalid UUID format +echo -e "${YELLOW}Test 4: Invalid telemetry_id format${NC}" +RESPONSE=$(curl -s -w "\n%{http_code}" -X POST "$ENDPOINT" \ + -H "Content-Type: application/json" \ + -d '{ + "event_id": "test_invalid_uuid", + "telemetry_id": "not-a-valid-uuid", + "timestamp": "'$(date -u +%Y-%m-%dT%H:%M:%SZ)'", + "environment": {"os": "Linux", "version": "1.0.0"}, + "usage": {"total_sessions": 1} + }') + +HTTP_CODE=$(echo "$RESPONSE" | tail -n1) +BODY=$(echo "$RESPONSE" | sed '$d') + +if [ "$HTTP_CODE" -eq 400 ] && echo "$BODY" | grep -q "UUID"; then + echo -e "${GREEN}✓ PASSED${NC} - Status: $HTTP_CODE" + echo "Response: $BODY" +else + echo -e "${RED}✗ FAILED${NC} - Expected 400 with UUID error" + echo "Response: $BODY" +fi +echo "" + +# Test 5: Rate limiting (requires multiple rapid requests) +echo -e "${YELLOW}Test 5: Rate limiting (11 requests in rapid succession)${NC}" +RATE_LIMIT_ID="750e8400-e29b-41d4-a716-446655440002" +RATE_LIMITED=false + +for i in {1..11}; do + RESPONSE=$(curl -s -w "\n%{http_code}" -X POST "$ENDPOINT" \ + -H "Content-Type: application/json" \ + -d '{ + "event_id": "rate_test_'$i'_'$(date +%s)'", + "telemetry_id": "'$RATE_LIMIT_ID'", + "timestamp": "'$(date -u +%Y-%m-%dT%H:%M:%SZ)'", + "environment": {"os": "Linux", "version": "1.0.0"}, + "usage": {"total_sessions": 1} + }') + + HTTP_CODE=$(echo "$RESPONSE" | tail -n1) + + if [ "$HTTP_CODE" -eq 429 ]; then + RATE_LIMITED=true + BODY=$(echo "$RESPONSE" | sed '$d') + echo -e "${GREEN}✓ PASSED${NC} - Rate limit triggered at request $i" + echo "Response: $BODY" + break + fi +done + +if [ "$RATE_LIMITED" = false ]; then + echo -e "${YELLOW}⚠ WARNING${NC} - Rate limit not triggered (may need to wait or adjust limits)" +fi +echo "" + +# Test 6: CORS preflight +echo -e "${YELLOW}Test 6: CORS preflight (OPTIONS request)${NC}" +RESPONSE=$(curl -s -w "\n%{http_code}" -X OPTIONS "$ENDPOINT" \ + -H "Origin: http://example.com" \ + -H "Access-Control-Request-Method: POST") + +HTTP_CODE=$(echo "$RESPONSE" | tail -n1) + +if [ "$HTTP_CODE" -eq 204 ]; then + echo -e "${GREEN}✓ PASSED${NC} - Status: $HTTP_CODE" +else + echo -e "${RED}✗ FAILED${NC} - Expected 204, got $HTTP_CODE" +fi +echo "" + +echo -e "${GREEN}Test suite complete!${NC}" diff --git a/cloudflare-worker/worker.js b/cloudflare-worker/worker.js new file mode 100644 index 00000000..07a4bdae --- /dev/null +++ b/cloudflare-worker/worker.js @@ -0,0 +1,354 @@ +/** + * VoiceMode Telemetry Endpoint + * + * Cloudflare Worker that receives telemetry events from VoiceMode clients. + * + * Features: + * - Payload validation + * - Rate limiting per anonymous ID (10 events/hour) + * - Rate limiting per IP (100 events/hour) + * - Idempotency via event_id + * - Storage in D1 database + * - CORS support + */ + +// Rate limit windows (in seconds) +const RATE_LIMIT_WINDOW = 3600; // 1 hour +const ID_RATE_LIMIT = 10; // max events per telemetry_id per hour +const IP_RATE_LIMIT = 100; // max events per IP per hour + +// Event retention period +const EVENT_RETENTION_DAYS = 90; + +export default { + async fetch(request, env, ctx) { + // Handle CORS preflight requests + if (request.method === "OPTIONS") { + return handleCORS(); + } + + // Only accept POST requests to /telemetry + if (request.method !== "POST" || !request.url.endsWith('/telemetry')) { + return new Response('Not Found', { + status: 404, + headers: getCORSHeaders() + }); + } + + try { + // Parse and validate payload + const payload = await request.json(); + const validationError = validatePayload(payload); + if (validationError) { + return new Response(JSON.stringify({ error: validationError }), { + status: 400, + headers: { + 'Content-Type': 'application/json', + ...getCORSHeaders() + } + }); + } + + // Extract client info + const clientIP = request.headers.get('CF-Connecting-IP') || 'unknown'; + const telemetryId = payload.telemetry_id; + const eventId = payload.event_id; + + // Check idempotency - has this event been seen before? + const isDuplicate = await checkEventExists(env.DB, eventId); + if (isDuplicate) { + // Return success for idempotency - client doesn't need to retry + return new Response(JSON.stringify({ + status: 'ok', + message: 'Event already recorded' + }), { + status: 200, + headers: { + 'Content-Type': 'application/json', + ...getCORSHeaders() + } + }); + } + + // Check rate limits + const idRateLimitExceeded = await checkRateLimit( + env.KV, + `rate:id:${telemetryId}`, + ID_RATE_LIMIT + ); + + if (idRateLimitExceeded) { + return new Response(JSON.stringify({ + error: 'Rate limit exceeded for telemetry ID', + retry_after: RATE_LIMIT_WINDOW + }), { + status: 429, + headers: { + 'Content-Type': 'application/json', + 'Retry-After': RATE_LIMIT_WINDOW.toString(), + ...getCORSHeaders() + } + }); + } + + const ipHash = await hashIP(clientIP); + const ipRateLimitExceeded = await checkRateLimit( + env.KV, + `rate:ip:${ipHash}`, + IP_RATE_LIMIT + ); + + if (ipRateLimitExceeded) { + return new Response(JSON.stringify({ + error: 'Rate limit exceeded for IP address', + retry_after: RATE_LIMIT_WINDOW + }), { + status: 429, + headers: { + 'Content-Type': 'application/json', + 'Retry-After': RATE_LIMIT_WINDOW.toString(), + ...getCORSHeaders() + } + }); + } + + // Store the event in D1 + await storeEvent(env.DB, payload, clientIP); + + // Increment rate limit counters + await incrementRateLimit(env.KV, `rate:id:${telemetryId}`); + await incrementRateLimit(env.KV, `rate:ip:${ipHash}`); + + // Return success + return new Response(JSON.stringify({ + status: 'ok', + event_id: eventId + }), { + status: 200, + headers: { + 'Content-Type': 'application/json', + ...getCORSHeaders() + } + }); + + } catch (error) { + console.error('Error processing telemetry:', error); + + return new Response(JSON.stringify({ + error: 'Internal server error', + message: error.message + }), { + status: 500, + headers: { + 'Content-Type': 'application/json', + ...getCORSHeaders() + } + }); + } + }, + + // Scheduled handler for cleanup tasks + async scheduled(event, env, ctx) { + // Clean up old events (beyond retention period) + const cutoffDate = new Date(); + cutoffDate.setDate(cutoffDate.getDate() - EVENT_RETENTION_DAYS); + + try { + await env.DB.prepare( + 'DELETE FROM events WHERE created_at < ?' + ).bind(cutoffDate.toISOString()).run(); + + console.log(`Cleaned up events older than ${EVENT_RETENTION_DAYS} days`); + } catch (error) { + console.error('Error during cleanup:', error); + } + } +}; + +/** + * Validate telemetry payload structure. + * + * Expected structure: + * { + * "event_id": "string", + * "telemetry_id": "string (UUID)", + * "timestamp": "ISO 8601 datetime string", + * "environment": { + * "os": "string", + * "version": "string", + * "installation_method": "string", + * "mcp_host": "string", + * "execution_source": "string" + * }, + * "usage": { + * "total_sessions": number, + * "duration_distribution": {...}, + * "transport_usage": {...}, + * "provider_usage": {...} + * } + * } + */ +function validatePayload(payload) { + if (!payload) { + return 'Missing payload'; + } + + // Required top-level fields + if (!payload.event_id || typeof payload.event_id !== 'string') { + return 'Invalid or missing event_id'; + } + + if (!payload.telemetry_id || typeof payload.telemetry_id !== 'string') { + return 'Invalid or missing telemetry_id'; + } + + // Validate telemetry_id is a valid UUID format + const uuidRegex = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i; + if (!uuidRegex.test(payload.telemetry_id)) { + return 'Invalid telemetry_id format (must be UUID)'; + } + + if (!payload.timestamp || typeof payload.timestamp !== 'string') { + return 'Invalid or missing timestamp'; + } + + // Validate timestamp is valid ISO 8601 + const timestamp = new Date(payload.timestamp); + if (isNaN(timestamp.getTime())) { + return 'Invalid timestamp format (must be ISO 8601)'; + } + + // Validate timestamp is not too far in the future or past + const now = new Date(); + const dayInMs = 86400000; + if (timestamp > new Date(now.getTime() + dayInMs)) { + return 'Timestamp too far in the future'; + } + if (timestamp < new Date(now.getTime() - (90 * dayInMs))) { + return 'Timestamp too old (max 90 days)'; + } + + // Required: environment object + if (!payload.environment || typeof payload.environment !== 'object') { + return 'Invalid or missing environment object'; + } + + const env = payload.environment; + if (!env.os || typeof env.os !== 'string') { + return 'Invalid or missing environment.os'; + } + if (!env.version || typeof env.version !== 'string') { + return 'Invalid or missing environment.version'; + } + + // Required: usage object + if (!payload.usage || typeof payload.usage !== 'object') { + return 'Invalid or missing usage object'; + } + + const usage = payload.usage; + if (typeof usage.total_sessions !== 'number') { + return 'Invalid or missing usage.total_sessions'; + } + + // All validation passed + return null; +} + +/** + * Check if an event already exists in the database. + */ +async function checkEventExists(db, eventId) { + const result = await db.prepare( + 'SELECT event_id FROM events WHERE event_id = ? LIMIT 1' + ).bind(eventId).first(); + + return result !== null; +} + +/** + * Check rate limit for a given key. + */ +async function checkRateLimit(kv, key, limit) { + const count = await kv.get(key); + if (!count) { + return false; // No previous requests + } + + return parseInt(count) >= limit; +} + +/** + * Increment rate limit counter. + */ +async function incrementRateLimit(kv, key) { + const current = await kv.get(key); + const newCount = current ? parseInt(current) + 1 : 1; + + // Store with TTL equal to the rate limit window + await kv.put(key, newCount.toString(), { + expirationTtl: RATE_LIMIT_WINDOW + }); +} + +/** + * Store telemetry event in D1 database. + */ +async function storeEvent(db, payload, clientIP) { + const hashedIP = await hashIP(clientIP); + + await db.prepare(` + INSERT INTO events ( + event_id, + telemetry_id, + timestamp, + environment, + usage, + client_ip_hash, + created_at + ) VALUES (?, ?, ?, ?, ?, ?, ?) + `).bind( + payload.event_id, + payload.telemetry_id, + payload.timestamp, + JSON.stringify(payload.environment), + JSON.stringify(payload.usage), + hashedIP, + new Date().toISOString() + ).run(); +} + +/** + * Hash IP address for privacy. + * Uses SHA-256 to anonymize while maintaining ability to rate limit. + */ +async function hashIP(ip) { + const encoder = new TextEncoder(); + const data = encoder.encode(ip); + const hashBuffer = await crypto.subtle.digest('SHA-256', data); + const hashArray = Array.from(new Uint8Array(hashBuffer)); + const hashHex = hashArray.map(b => b.toString(16).padStart(2, '0')).join(''); + return hashHex.substring(0, 16); // Use first 16 chars +} + +/** + * Get CORS headers. + */ +function getCORSHeaders() { + return { + 'Access-Control-Allow-Origin': '*', + 'Access-Control-Allow-Methods': 'POST, OPTIONS', + 'Access-Control-Allow-Headers': 'Content-Type', + 'Access-Control-Max-Age': '86400', + }; +} + +/** + * Handle CORS preflight requests. + */ +function handleCORS() { + return new Response(null, { + status: 204, + headers: getCORSHeaders() + }); +} diff --git a/cloudflare-worker/wrangler.toml b/cloudflare-worker/wrangler.toml new file mode 100644 index 00000000..0c31e9bc --- /dev/null +++ b/cloudflare-worker/wrangler.toml @@ -0,0 +1,39 @@ +name = "voicemode-telemetry" +main = "worker.js" +compatibility_date = "2024-12-01" + +# Account ID - REPLACE with your Cloudflare account ID +# Find at: https://dash.cloudflare.com/ -> select account -> copy Account ID from right sidebar +account_id = "2d152ec3e57b70274d69583b496e31ad" + +# Workers paid plan required for longer CPU time and higher limits +# Free tier: 100,000 requests/day, 10ms CPU time +# Paid tier: Unlimited requests, 50ms CPU time, $5/month base + usage +# workers_dev = true # Uncomment to deploy to workers.dev subdomain for testing + +# KV namespace for rate limiting +# Create with: wrangler kv:namespace create "RATE_LIMITS" +# Then replace the ID below with the one from the command output +[[kv_namespaces]] +binding = "KV" +id = "cb2a5a76f7fb45e0830fadb70587c87d" + +# D1 database for telemetry events +# Create with: wrangler d1 create voicemode-telemetry +# Then replace the database_id below with the one from the command output +[[d1_databases]] +binding = "DB" +database_name = "voicemode-telemetry" +database_id = "f7681eaa-a907-40f9-8ee8-a284c9477bb1" + +# Cron trigger for cleanup tasks (runs daily at 2am UTC) +[triggers] +crons = ["0 2 * * *"] + +# Environment variables (optional) +[vars] +# ENVIRONMENT = "production" + +# Limits (commented out - using free tier defaults) +# [limits] +# cpu_ms = 10 # Free tier: 10ms, Paid tier: 50ms diff --git a/docs/reference/environment.md b/docs/reference/environment.md index 4e24eafd..87087489 100644 --- a/docs/reference/environment.md +++ b/docs/reference/environment.md @@ -133,6 +133,64 @@ Supported formats: `pcm`, `opus`, `mp3`, `wav`, `flac`, `aac` Log levels: `debug`, `info`, `warning`, `error`, `critical` +## Telemetry and Privacy + +VoiceMode includes optional, privacy-respecting telemetry to help improve the project. + +### Telemetry Configuration + +| Variable | Description | Default | Example | +|----------|-------------|---------|---------| +| `VOICEMODE_TELEMETRY` | Enable telemetry (`true`/`false`/`ask`) | `ask` | `true` | +| `DO_NOT_TRACK` | Universal opt-out (any value disables telemetry) | Not set | `1` | +| `VOICEMODE_TELEMETRY_ENDPOINT` | Custom telemetry endpoint URL | (project default) | `https://...` | + +### Opt-Out Methods + +There are several ways to disable telemetry: + +1. **Universal Standard**: Set `DO_NOT_TRACK=1` in your environment (this is the [Console Do Not Track](https://consoledonottrack.com/) standard) +2. **Explicit Opt-Out**: Set `VOICEMODE_TELEMETRY=false` +3. **CLI Prompt**: When first asked, choose "No" to opt out + +The precedence order is: +1. `DO_NOT_TRACK` overrides everything (if set to any value) +2. `VOICEMODE_TELEMETRY=true` or `VOICEMODE_TELEMETRY=false` for explicit preference +3. `VOICEMODE_TELEMETRY=ask` (default) prompts user on first interactive use + +### What IS Collected + +When telemetry is enabled, VoiceMode collects: + +- **Anonymous ID**: Random UUID generated on first run (no connection to identity) +- **Environment**: OS type, VoiceMode version, installation method (dev/uv/pip) +- **Usage statistics** (binned for privacy): + - Number of conversations (not content) + - Duration bins (e.g., "1-5 minutes", not exact times) + - Exchange counts (e.g., "6-10 exchanges", not exact counts) + - Provider usage (e.g., "kokoro", "whisper-local") + - Transport type (local/livekit) + +### What is NOT Collected + +VoiceMode telemetry never collects: + +- Voice recordings or audio content +- Transcribed text or conversation content +- File paths beyond anonymized patterns +- IP addresses (hashed for rate limiting only) +- User names, emails, or identifying information +- API keys or credentials +- Specific timestamps (only date-based aggregation) + +### Privacy Protections + +- **Binning**: Exact values are grouped into ranges to prevent fingerprinting +- **Anonymization**: All paths and error messages are sanitized +- **Rate Limiting**: Maximum 10 events per hour per anonymous ID +- **Retention**: Data automatically deleted after 90 days +- **No PII**: Designed from the ground up to avoid personal information + ## Advanced Features ### Emotional TTS diff --git a/features.json b/features.json new file mode 100644 index 00000000..c2059eba --- /dev/null +++ b/features.json @@ -0,0 +1,157 @@ +{ + "project": "VoiceMode Telemetry", + "version": "1.0.0", + "created": "2024-12-14", + "task_id": "VM-152", + "description": "Opt-in telemetry system for understanding VoiceMode usage patterns", + "features": [ + { + "id": "tel-001", + "category": "Research", + "priority": 1, + "description": "Research DO_NOT_TRACK and telemetry standards", + "steps": [ + "Research DO_NOT_TRACK environment variable standard", + "Research other telemetry opt-out conventions", + "Document precedence rules (DO_NOT_TRACK > VOICEMODE_TELEMETRY)" + ], + "status": "passing", + "last_updated": "2024-12-14", + "notes": "Report at research/tel-001-do-not-track.md. Key: DO_NOT_TRACK any value = opt-out, overrides VOICEMODE_TELEMETRY. Default telemetry to disabled." + }, + { + "id": "tel-002", + "category": "Research", + "priority": 1, + "description": "Research telemetry backend options", + "steps": [ + "Evaluate self-hosted options (S3+Athena, simple server)", + "Evaluate privacy-focused services (Plausible, PostHog)", + "Consider Cloudflare Workers for simple collection", + "Document pros/cons and recommend approach" + ], + "status": "passing", + "last_updated": "2024-12-14", + "notes": "Report at research/tel-002-backend-options.md. Recommendation: Cloudflare Workers + Analytics Engine. Free tier, easy rate limiting, 2-4hr setup." + }, + { + "id": "tel-003", + "category": "Core", + "priority": 2, + "description": "Implement anonymous ID generation", + "steps": [ + "Generate permanent UUID on first install", + "Store in ~/.voicemode/telemetry_id", + "Make available via config module" + ], + "status": "passing", + "last_updated": "2024-12-14", + "notes": "Implemented in voice_mode/config.py. Added get_telemetry_id() function that generates UUID on first run, stores in ~/.voicemode/telemetry_id with 0o600 permissions, validates on subsequent loads. Exposed as TELEMETRY_ID constant in config module." + }, + { + "id": "tel-004", + "category": "Core", + "priority": 2, + "description": "Add environment detection", + "steps": [ + "Detect OS type (platform.system())", + "Detect installation method (uv/pip/dev)", + "Detect MCP host (Claude Code, Cursor, etc.)", + "Detect source (MCP vs CLI)" + ], + "status": "passing", + "last_updated": "2024-12-14", + "notes": "Implemented in voice_mode/config.py. Added environment detection functions: get_os_type() detects OS via platform.system(), get_installation_method() detects dev/uv/pip install by checking git repo and sys.prefix, get_mcp_host() detects Claude Code/Cursor/Cline via env vars and process tree, get_execution_source() detects MCP vs CLI via stdin pipe check and argv. All functions use lazy caching in _environment_cache dict. Added get_environment_info() to retrieve all values. Tested successfully on Linux dev environment." + }, + { + "id": "tel-005", + "category": "Core", + "priority": 3, + "description": "Create telemetry module", + "steps": [ + "Create voice_mode/telemetry/__init__.py", + "Implement collector.py - gather from existing logs", + "Implement privacy.py - anonymization and binning", + "Implement client.py - send to endpoint" + ], + "status": "passing", + "last_updated": "2024-12-14", + "notes": "Implemented complete telemetry module at voice_mode/telemetry/. Created privacy.py with bin_duration(), bin_size(), anonymize_path(), anonymize_error_message(), and sanitize_version_string() for privacy-preserving data collection. Implemented collector.py with TelemetryCollector class that analyzes event logs (SESSION_START, TTS_START, STT_START, ERROR events) and conversation logs to extract: session counts/durations (binned <1min, 1-5min, 5-10min, 10-20min, 20-60min, >60min), exchanges per session (binned 0, 1-5, 6-10, 11-20, >20), TTS/STT provider usage (openai, kokoro, whisper-local, other), transport type (local/livekit), success/failure rates, and anonymized error types. Implemented client.py with TelemetryClient class for HTTP transmission with: deterministic event ID generation (SHA256 hash of telemetry_id+timestamp), retry logic with exponential backoff, offline queueing in ~/.voicemode/telemetry_queue/, rate limit handling (429 responses), and old event cleanup. Integrated with config.py for telemetry ID and environment detection. All imports tested successfully. Data collection tested against real logs showing 1503 sessions collected with duration/exchange distributions." + }, + { + "id": "tel-006", + "category": "Config", + "priority": 3, + "description": "Add telemetry configuration", + "steps": [ + "Add VOICEMODE_TELEMETRY with ask/true/false values", + "Add DO_NOT_TRACK support with override precedence", + "Update config.py to load telemetry settings" + ], + "status": "passing", + "last_updated": "2025-12-14", + "notes": "Implemented in voice_mode/config.py. Added VOICEMODE_TELEMETRY environment variable with values 'ask' (default), 'true', 'false' and normalization for boolean-style values (1/0, yes/no, on/off). Added DO_NOT_TRACK environment variable support - any value disables telemetry (universal opt-out standard). Added VOICEMODE_TELEMETRY_ENDPOINT for backend URL configuration. Implemented is_telemetry_enabled() function with correct precedence: DO_NOT_TRACK overrides everything, then VOICEMODE_TELEMETRY values, default to disabled (privacy-first). Implemented get_telemetry_status() function that returns detailed status including enabled state, reason, endpoint, and telemetry ID. All precedence scenarios tested and verified: DO_NOT_TRACK=1 overrides VOICEMODE_TELEMETRY=true, explicit opt-in/opt-out works correctly, 'ask' defaults to disabled until user opts in. Integration tested with telemetry module - successfully imports and uses config functions." + }, + { + "id": "tel-007", + "category": "UX", + "priority": 4, + "description": "Implement telemetry opt-in prompts", + "steps": [ + "Add CLI prompt when VOICEMODE_TELEMETRY=ask", + "Create MCP resource for LLM to present opt-in", + "Add tool for LLM to set telemetry preference", + "Update voicemode.env after user choice" + ], + "status": "passing", + "last_updated": "2025-12-14", + "notes": "Implemented complete telemetry opt-in system. Created voice_mode/resources/telemetry.py with two MCP resources: voicemode://telemetry/status (detailed telemetry status and information for LLMs) and voicemode://telemetry/opt-in-prompt (user-friendly prompt text). Created voice_mode/tools/telemetry_management.py with two MCP tools: telemetry_set_preference(enabled=bool) to save user choice to ~/.voicemode/voicemode.env and telemetry_check_status() for quick status checks. Created voice_mode/utils/telemetry_prompt.py with CLI utilities: should_prompt_for_telemetry(), prompt_for_telemetry_consent(), and save_telemetry_preference(). Integrated prompts into CLI by modifying voice_mode/cli.py to call maybe_prompt_for_telemetry() for interactive CLI commands (skips MCP server mode). The prompt explains what data is collected, privacy protections, and asks user to opt-in or opt-out. User preference is saved to config file with proper commenting and formatting. All functionality tested: resources registered and accessible, tools work correctly, config file updates properly, CLI integration works. Resources and tools follow existing VoiceMode patterns (FastMCP decorators, async functions, detailed docstrings)." + }, + { + "id": "tel-008", + "category": "Backend", + "priority": 5, + "description": "Implement telemetry endpoint", + "steps": [ + "Set up chosen backend (per tel-002 research)", + "Implement rate limiting per anonymous ID", + "Implement IP-based rate limiting", + "Add payload validation", + "Ensure idempotency via event keys" + ], + "status": "passing", + "last_updated": "2025-12-14", + "notes": "Implemented complete Cloudflare Worker backend at cloudflare-worker/. Created worker.js with: payload validation (event_id, telemetry_id UUID format, timestamp ISO 8601 validation, environment and usage object validation), rate limiting via KV namespace (10 events/hour per telemetry_id, 100 events/hour per IP), idempotency via D1 database event_id primary key check, IP hashing for privacy (SHA-256), CORS support, scheduled cleanup job (daily at 2am UTC to remove events >90 days). Created schema.sql for D1 database with events table and indexes for efficient querying (telemetry_id, timestamp, composite indexes). Created wrangler.toml configuration with KV and D1 bindings, cron triggers. Created comprehensive README.md with setup instructions, cost estimation ($0-5/month free tier), example queries (DAU, retention, provider usage), monitoring guidance. Created DEPLOYMENT.md checklist for step-by-step deployment process. Created test-endpoint.sh script to validate all functionality (valid payload, idempotency, validation errors, rate limiting, CORS). Created package.json for npm scripts and .gitignore for security. Worker is production-ready following Cloudflare best practices. User needs to: install wrangler CLI, authenticate with Cloudflare, create KV namespace and D1 database, update wrangler.toml with IDs, run schema.sql, deploy with 'wrangler deploy', configure VoiceMode with VOICEMODE_TELEMETRY_ENDPOINT. All 5 steps completed successfully." + }, + { + "id": "tel-009", + "category": "Testing", + "priority": 6, + "description": "Dogfood telemetry locally", + "steps": [ + "Run telemetry collector on local logs", + "Verify data points are correct", + "Test upload to staging endpoint", + "Confirm privacy binning works" + ], + "status": "passing", + "last_updated": "2025-12-14", + "notes": "Full end-to-end dogfooding complete. Updated collector to use conversation logs (exchanges_*.jsonl) instead of event logs for accurate multi-exchange conversation tracking. Collector now correctly identifies 761 conversations with 19,081 exchanges. Added _normalize_provider_name() for consistent provider names (kokoro, whisper-local, openai). Privacy binning verified: duration bins (<1min, 1-5min, 5-10min, 10-20min, 20-60min, >60min), exchange bins (0, 1-5, 6-10, 11-20, >20). Deployed Cloudflare Worker to https://voicemode-telemetry.late-limit-5e4c.workers.dev/telemetry with D1 database and KV rate limiting. Successfully sent real telemetry from local logs to production endpoint. Data stored and queryable via wrangler d1 execute. All 6 endpoint tests pass (valid payload, idempotency, validation, rate limiting, CORS)." + }, + { + "id": "tel-010", + "category": "Docs", + "priority": 7, + "description": "Document telemetry and privacy", + "steps": [ + "Add Privacy section to README", + "Document what is/isn't collected", + "Document opt-out methods", + "Add telemetry config to docs" + ], + "status": "passing", + "last_updated": "2025-12-14", + "notes": "Added comprehensive 'Telemetry and Privacy' section to docs/reference/environment.md. Documents: configuration variables (VOICEMODE_TELEMETRY, DO_NOT_TRACK, VOICEMODE_TELEMETRY_ENDPOINT), three opt-out methods (DO_NOT_TRACK standard, explicit false, CLI prompt), precedence rules, what IS collected (anonymous ID, environment, binned usage stats), what is NOT collected (audio, text, paths, IP, PII, credentials), and privacy protections (binning, anonymization, rate limiting, retention, no PII). Links to Console Do Not Track standard for universal opt-out." + } + ] +} diff --git a/progress.json b/progress.json new file mode 100644 index 00000000..b6b2c136 --- /dev/null +++ b/progress.json @@ -0,0 +1,29 @@ +{ + "workflow_id": "VM-152-telemetry-harness", + "task_id": "VM-152", + "started": "2024-12-14", + "current_phase": "complete", + "current_feature": null, + "features_completed": 10, + "features_total": 10, + "sessions": [ + { + "session_id": 1, + "started": "2024-12-14T12:30:00", + "ended": "2024-12-14T14:00:00", + "features_worked": ["tel-001", "tel-002", "tel-003", "tel-004", "tel-005", "tel-006", "tel-007"], + "summary": "Voice-driven session. Planned MVP scope, spawned worker agents for implementation. Completed: DO_NOT_TRACK research (tel-001), backend research recommending Cloudflare Workers (tel-002), anonymous ID generation (tel-003), environment detection for OS/install/MCP host (tel-004), full telemetry module with collector/privacy/client (tel-005), telemetry config with ask/true/false and DO_NOT_TRACK override (tel-006), and opt-in prompts for CLI and MCP (tel-007)." + }, + { + "session_id": 2, + "started": "2025-12-14T04:30:00", + "ended": "2025-12-14T05:30:00", + "features_worked": ["tel-008", "tel-009", "tel-010"], + "summary": "Voice-driven session to complete remaining features. Deployed Cloudflare Worker backend (tel-008): fixed async hashIP bug, deployed to https://voicemode-telemetry.late-limit-5e4c.workers.dev with D1 database and KV rate limiting, all 6 tests passing. Dogfooding (tel-009): updated collector to use conversation logs for accurate multi-exchange tracking (761 conversations, 19,081 exchanges), added provider name normalization, verified privacy binning, sent real telemetry to production endpoint. Documentation (tel-010): added comprehensive Telemetry and Privacy section to docs/reference/environment.md covering config, opt-out methods, data collection, and privacy protections." + } + ], + "context_summaries": { + "1": "Session 1: Completed 7/10 features via voice conversation + worker agents. Key deliverables: telemetry module (voice_mode/telemetry/), config integration (VOICEMODE_TELEMETRY, DO_NOT_TRACK), opt-in UX (CLI prompts, MCP tools/resources). Research confirmed Cloudflare Workers for backend. Remaining: tel-008 (backend), tel-009 (dogfood), tel-010 (docs). Harness pattern working well with Task tool spawning workers.", + "2": "Session 2: Completed final 3 features. Deployed Cloudflare Worker to production with D1 database and KV rate limiting. Improved collector to use conversation logs for accurate session tracking. Added privacy documentation. All 10/10 features now passing. Telemetry system is production-ready with endpoint at https://voicemode-telemetry.late-limit-5e4c.workers.dev/telemetry." + } +} diff --git a/research/tel-001-do-not-track.md b/research/tel-001-do-not-track.md new file mode 100644 index 00000000..407a34c6 --- /dev/null +++ b/research/tel-001-do-not-track.md @@ -0,0 +1,569 @@ +# TEL-001: DO_NOT_TRACK and Telemetry Opt-Out Standards Research + +**Date**: 2025-12-14 +**Project**: VoiceMode Telemetry System +**Task**: VM-152 - Add telemetry and analytics system +**Author**: Research Report + +## Executive Summary + +This report examines the DO_NOT_TRACK environment variable standard and telemetry opt-out conventions for CLI tools. The findings recommend that VoiceMode implement a layered approach respecting both DO_NOT_TRACK (universal opt-out) and VOICEMODE_TELEMETRY (tool-specific control), with DO_NOT_TRACK taking precedence to honor user privacy preferences. + +## 1. The DO_NOT_TRACK Environment Variable Standard + +### 1.1 Overview + +DO_NOT_TRACK is a proposed universal environment variable for CLI applications that mirrors the browser DNT (Do Not Track) HTTP header. It provides a single, standard way for users to opt out of telemetry across all supporting tools. + +**Official Resources:** +- Primary specification: https://consoledonottrack.com/ +- Additional documentation: https://do-not-track.dev/ + +### 1.2 Core Principle + +From the Console Do Not Track proposal: + +> "This is a proposal for a single, standard environment variable that plainly and unambiguously expresses LACK OF CONSENT by a user of that software to any non-essential-to-functionality requests of any kind to the creator of the software or other tracking services." + +### 1.3 Accepted Values + +**Implementation Pattern**: The standard specifies that DO_NOT_TRACK should be checked for **presence**, not a specific value. + +According to the specification: +- **If the environment variable is set to any value**, telemetry should be disabled +- Common values used: `1`, `true`, or any non-empty string +- The mere presence of the variable indicates lack of consent + +**Examples from the wild:** +```bash +DO_NOT_TRACK=1 # Most common +DO_NOT_TRACK=true # Also acceptable +DO_NOT_TRACK=yes # Also acceptable +DO_NOT_TRACK=anything # Still indicates opt-out +``` + +### 1.4 Implementation Guidelines + +**Basic check pattern:** +```python +import os + +# Check if DO_NOT_TRACK is set (regardless of value) +if os.getenv('DO_NOT_TRACK'): + # Disable all telemetry + telemetry_enabled = False +``` + +**Important notes:** +- The presence of the variable is what matters, not its value +- An empty string (`DO_NOT_TRACK=`) may be interpreted differently by implementations +- Most tools treat any non-empty value as "do not track" + +### 1.5 Relationship to Browser DNT Header + +The browser DNT header historically used specific values: +- `"1"` = Do not track (DNT enabled) +- `"0"` = User consents to tracking +- `null` or `"unspecified"` = No preference set + +However, the browser DNT specification has been **discontinued** as of 2024 because: +- It was a cooperative feature with no enforcement +- Advertisement websites ignored the header +- The mechanism design was fundamentally flawed + +The CLI DO_NOT_TRACK convention learned from this by making the semantics simpler: **presence = opt-out**. + +## 2. Other Common Telemetry Opt-Out Conventions + +### 2.1 The NO_COLOR Pattern + +**Standard**: https://no-color.org/ + +NO_COLOR provides an analogous pattern for disabling ANSI color output: + +> "All command-line software which outputs text with ANSI color added should check for the presence of a NO_COLOR environment variable that, when present (regardless of its value), prevents the addition of ANSI color." + +**Key similarities to DO_NOT_TRACK:** +- Checks for **presence**, not specific value +- Universal standard across tools +- Simple, clear semantics + +**Related standards:** +- `FORCE_COLOR` - Forces color output even when piped +- `CLICOLOR` / `CLICOLOR_FORCE` - Older color control variables + +**Adoption**: NO_COLOR is widely adopted since util-linux version 2.41 and many other tools. + +### 2.2 Tool-Specific Environment Variables + +Most CLI tools use tool-specific environment variables following common naming patterns: + +#### Pattern 1: `[TOOL]_TELEMETRY_OPTOUT` +```bash +DOTNET_CLI_TELEMETRY_OPTOUT=1 # .NET SDK +PP_TOOLS_TELEMETRY_OPTOUT=1 # Microsoft Power Platform +DOTNET_UPGRADEASSISTANT_TELEMETRY_OPTOUT=1 # .NET Upgrade Assistant +``` + +#### Pattern 2: `[TOOL]_TELEMETRY_DISABLED` +```bash +NEXT_TELEMETRY_DISABLED=1 # Next.js +GATSBY_TELEMETRY_DISABLED=1 # Gatsby +NUXT_TELEMETRY_DISABLED=1 # Nuxt.js +STORYBOOK_DISABLE_TELEMETRY=1 # Storybook +ASTRO_TELEMETRY_DISABLED=1 # Astro +TURBO_TELEMETRY_DISABLED=1 # Turbo (also supports DO_NOT_TRACK) +``` + +#### Pattern 3: `[TOOL]_DISABLE_[FEATURE]` +```bash +CDK_DISABLE_CLI_TELEMETRY=true # AWS CDK (starting Dec 2025) +``` + +#### Pattern 4: `[TOOL]_NO_[FEATURE]` +```bash +HOMEBREW_NO_ANALYTICS=1 # Homebrew +``` + +#### Pattern 5: `[TOOL]_SEND_ANONYMOUS_USAGE_STATS` +```bash +DBT_SEND_ANONYMOUS_USAGE_STATS=False # dbt (also supports DO_NOT_TRACK) +``` + +**Common value conventions:** +- `1` or `true` for boolean variables +- Some tools accept any truthy value +- Case sensitivity varies by tool + +### 2.3 Configuration File Methods + +Many tools also support disabling telemetry via configuration files: + +**AWS CDK**: `cdk.json` or `~/.cdk.json` +```json +{ + "cli-telemetry": false +} +``` + +**Google Cloud Cortex**: `config.json` +```json +{ + "allowTelemetry": false +} +``` + +**Google Cloud SDK**: Command-based config +```bash +gcloud config set disable_usage_reporting true +``` + +### 2.4 Command-Line Flags + +Some tools offer per-invocation opt-out: + +```bash +netlify --telemetry-disable # Netlify CLI +turbo telemetry disable # Turbo (persistent) +``` + +## 3. Examples of Popular CLI Tools + +### 3.1 Tools Supporting DO_NOT_TRACK + +| Tool | DO_NOT_TRACK Support | Tool-Specific Variable | Notes | +|------|---------------------|------------------------|-------| +| **dbt** | Yes | `DBT_SEND_ANONYMOUS_USAGE_STATS=False` | DO_NOT_TRACK=1 equivalent to DBT variable | +| **Turbo (Vercel)** | Yes | `TURBO_TELEMETRY_DISABLED=1` | Supports both standards | +| **vLLM** | Yes | `VLLM_DO_NOT_TRACK` | Checks both VLLM and generic DO_NOT_TRACK | +| **Meteor** | Yes | - | Any truthy value disables stats | +| **FerretDB** | Yes | - | Respects DO_NOT_TRACK flag | +| **Bun** | Yes | - | Respects DO_NOT_TRACK flag | + +### 3.2 Tools NOT Supporting DO_NOT_TRACK (Yet) + +| Tool | Tool-Specific Variable | GitHub Issues | +|------|------------------------|---------------| +| **npm** | - | [npm/feedback#481](https://github.com/npm/feedback/discussions/481) - Requested | +| **Netlify CLI** | `--telemetry-disable` flag | [netlify/cli#737](https://github.com/netlify/cli/issues/737) - Requested | +| **Homebrew** | `HOMEBREW_NO_ANALYTICS=1` | Pre-dates DO_NOT_TRACK standard | +| **Next.js** | `NEXT_TELEMETRY_DISABLED=1` | - | +| **AWS CDK** | `CDK_DISABLE_CLI_TELEMETRY=true` | Starting Dec 12, 2025 | + +### 3.3 Tool Adoption Timeline + +- **2017-2021**: Most tools used tool-specific variables only +- **2021**: Console Do Not Track standard proposed ([Hacker News discussion](https://news.ycombinator.com/item?id=27746587)) +- **2022-2024**: Gradual adoption by newer tools +- **2024-2025**: Increasing awareness but still not universal + +**Key insight**: DO_NOT_TRACK is gaining traction but is not yet universally adopted. Tools should support both DO_NOT_TRACK and their own tool-specific variables. + +## 4. Recommended Precedence Rules for VoiceMode + +### 4.1 Decision Hierarchy + +Based on best practices and user expectations, VoiceMode should implement the following precedence: + +``` +1. DO_NOT_TRACK (if set) → ALWAYS disable telemetry +2. VOICEMODE_TELEMETRY (if set) → Explicit user preference +3. Interactive prompt (first run) → Opt-in consent +4. Default → Telemetry DISABLED (privacy-first) +``` + +### 4.2 Value Combination Matrix + +| DO_NOT_TRACK | VOICEMODE_TELEMETRY | Resulting Behavior | Rationale | +|--------------|---------------------|-------------------|-----------| +| **Set** (any value) | Not set | DISABLED | Universal opt-out takes precedence | +| **Set** (any value) | `false` / `0` | DISABLED | Universal opt-out overrides tool preference | +| **Set** (any value) | `true` / `1` | DISABLED | Universal opt-out is strongest signal | +| Not set | `true` / `1` | ENABLED | Explicit opt-in honored | +| Not set | `false` / `0` | DISABLED | Explicit opt-out honored | +| Not set | Not set | **DISABLED** (default) | Privacy-first default | + +### 4.3 Implementation Logic + +```python +import os + +def should_enable_telemetry() -> bool: + """ + Determine if telemetry should be enabled based on environment + and user preferences. + + Precedence: + 1. DO_NOT_TRACK - universal opt-out (highest priority) + 2. VOICEMODE_TELEMETRY - tool-specific preference + 3. Stored consent from interactive prompt + 4. Default to disabled (privacy-first) + """ + # 1. Check DO_NOT_TRACK (any value means opt-out) + if os.getenv('DO_NOT_TRACK'): + return False + + # 2. Check tool-specific VOICEMODE_TELEMETRY + voicemode_telemetry = os.getenv('VOICEMODE_TELEMETRY', '').lower() + if voicemode_telemetry in ('1', 'true', 'yes', 'on'): + return True + if voicemode_telemetry in ('0', 'false', 'no', 'off'): + return False + + # 3. Check stored user consent (from config file) + stored_consent = get_stored_telemetry_consent() + if stored_consent is not None: + return stored_consent + + # 4. Default to disabled (privacy-first) + return False +``` + +### 4.4 Configuration File Interaction + +The configuration file should store explicit consent given via: +- Interactive prompt on first run +- `voicemode config set VOICEMODE_TELEMETRY true/false` +- Direct config file editing + +**Priority order:** +1. Environment variable DO_NOT_TRACK (overrides everything) +2. Environment variable VOICEMODE_TELEMETRY (overrides config file) +3. Config file setting +4. Default (disabled) + +This allows: +- Global opt-out via `DO_NOT_TRACK` in shell profile +- Per-session override via `VOICEMODE_TELEMETRY=true voicemode ...` +- Persistent preference via config file + +## 5. Best Practices for Respecting User Privacy + +### 5.1 GDPR Compliance Requirements + +**Critical requirements for telemetry under GDPR:** + +1. **Opt-in by Default** + - Telemetry MUST be disabled by default + - User must actively consent (opt-in), not just fail to opt-out + - Pre-checked boxes are NOT compliant + +2. **Informed Consent** + - Clearly explain what data is collected + - Explain specific purposes (vague purposes insufficient) + - Show before collection begins + +3. **Easy Revocation** + - Disabling telemetry must be as easy as enabling it + - Provide multiple methods (env var, config, command) + +4. **Data Minimization** + - Only collect data actually needed + - Avoid collecting personally identifiable information (PII) + +5. **Transparency** + - Document telemetry in privacy policy + - Make telemetry status visible to users + +**Penalties**: GDPR violations can result in fines up to €20 million or 4% of annual worldwide turnover. + +### 5.2 Privacy-First Principles + +#### Data Collection Guidelines + +**DO collect:** +- Anonymous usage statistics (command invoked, success/failure) +- Performance metrics (execution time, resource usage) +- Error types (not error messages with potential PII) +- Feature usage counts +- Python version, OS type (generalized) + +**DO NOT collect:** +- File paths (may contain usernames) +- File contents or code snippets +- Error messages with stack traces (may contain paths/data) +- Environment variables (except telemetry-related) +- Network information beyond basic connectivity +- Usernames, emails, or any PII +- Git commit messages or branch names + +#### Anonymization Strategies + +1. **Hash user identifiers**: Use one-way hashes for any user/machine IDs +2. **Aggregate data**: Report counts, not individual events when possible +3. **Strip paths**: Remove or generalize file system paths +4. **Redact content**: Never include user data in telemetry payloads +5. **Session IDs**: Use random session IDs, not machine IDs + +### 5.3 Transparency Best Practices + +#### First-Run Experience + +``` +Welcome to VoiceMode! + +VoiceMode collects anonymous usage data to help improve the tool. + +What we collect: + - Commands used and their success/failure + - Performance metrics (execution time) + - Error types (not your data or file paths) + - Python and OS version + +What we DON'T collect: + - Your code, files, or file paths + - Personal information + - Environment variables + +You can: + - Opt in now (default: disabled) + - Review privacy policy: https://voicemode.dev/privacy + - Disable anytime: voicemode config set VOICEMODE_TELEMETRY false + - Or set DO_NOT_TRACK=1 in your shell + +Enable anonymous telemetry? [y/N]: +``` + +#### Status Visibility + +Users should be able to check telemetry status: + +```bash +$ voicemode telemetry status +Telemetry: DISABLED +Reason: DO_NOT_TRACK environment variable is set + +$ voicemode telemetry status +Telemetry: ENABLED +Reason: User consent via interactive prompt (2024-12-14) +Override: Set DO_NOT_TRACK=1 or VOICEMODE_TELEMETRY=false +``` + +#### Documentation Requirements + +1. **Privacy Policy**: Detailed data collection disclosure +2. **README**: Mention telemetry and how to disable +3. **Installation docs**: Explain opt-in process +4. **Config docs**: Document all telemetry controls +5. **Help text**: Include telemetry commands in `--help` + +### 5.4 Technical Implementation Best Practices + +#### Fail-Safe Defaults + +```python +# If anything goes wrong detecting preferences, default to disabled +try: + telemetry_enabled = should_enable_telemetry() +except Exception: + telemetry_enabled = False # Fail safe +``` + +#### Non-Blocking Telemetry + +- Telemetry should NEVER slow down the tool +- Send asynchronously in background +- Set short timeouts (1-2 seconds max) +- Silently fail if endpoint unreachable +- Don't retry failed sends + +#### Respect Network Conditions + +- Check for connectivity before sending +- Respect offline mode +- Don't send over metered connections (if detectable) + +#### Audit Trail + +- Log when consent is given/revoked (locally) +- Include timestamp and method of consent +- Allow users to export their telemetry data + +### 5.5 Ethical Considerations + +1. **User Trust**: Respect DO_NOT_TRACK even when not legally required +2. **Progressive Disclosure**: Don't collect more data than initially disclosed +3. **Purpose Limitation**: Only use data for stated purposes +4. **Regular Review**: Audit collected data to ensure compliance +5. **Data Retention**: Delete old telemetry data (suggest 90-day retention) + +## 6. Recommendations for VoiceMode + +### 6.1 Implementation Checklist + +- [ ] **Support DO_NOT_TRACK** (universal opt-out) +- [ ] **Support VOICEMODE_TELEMETRY** (tool-specific control) +- [ ] **Implement precedence**: DO_NOT_TRACK > VOICEMODE_TELEMETRY > config > default +- [ ] **Default to disabled** (privacy-first, GDPR-compliant) +- [ ] **Interactive opt-in prompt** on first run +- [ ] **Config file storage** for persistent preference +- [ ] **Status command**: `voicemode telemetry status` +- [ ] **Enable command**: `voicemode telemetry enable` +- [ ] **Disable command**: `voicemode telemetry disable` +- [ ] **Privacy policy** documentation +- [ ] **README section** about telemetry +- [ ] **Anonymize data** (hash IDs, strip paths) +- [ ] **Non-blocking sends** (async, short timeout) +- [ ] **Audit logging** (consent events) + +### 6.2 Configuration Variables + +Recommended environment variable naming: + +```bash +# Primary control (tool-specific) +VOICEMODE_TELEMETRY=true|false|1|0 + +# Universal opt-out (respect this FIRST) +DO_NOT_TRACK=1 + +# Optional: telemetry endpoint override (for testing) +VOICEMODE_TELEMETRY_ENDPOINT=https://custom.endpoint/events + +# Optional: debug telemetry without sending +VOICEMODE_TELEMETRY_DEBUG=1 +``` + +### 6.3 Config File Schema + +`~/.voicemode/config/config.yaml`: + +```yaml +telemetry: + enabled: false # explicit user preference + consent_date: "2024-12-14T10:30:00Z" # when consent given + consent_method: "interactive_prompt" # or "config_set", "env_var" + anonymous_id: "hash-of-machine-id" # one-way hash for analytics + + # What to collect (granular control) + collect: + usage_stats: true + performance_metrics: true + error_types: true + feature_usage: true +``` + +### 6.4 Testing Strategy + +Test all combinations: + +```bash +# Test DO_NOT_TRACK override +DO_NOT_TRACK=1 VOICEMODE_TELEMETRY=true voicemode test +# Expected: Telemetry DISABLED (DO_NOT_TRACK wins) + +# Test explicit opt-in +VOICEMODE_TELEMETRY=true voicemode test +# Expected: Telemetry ENABLED + +# Test explicit opt-out +VOICEMODE_TELEMETRY=false voicemode test +# Expected: Telemetry DISABLED + +# Test default (no vars set) +voicemode test +# Expected: Telemetry DISABLED (privacy-first default) + +# Test config file +voicemode config set VOICEMODE_TELEMETRY true +voicemode test +# Expected: Telemetry ENABLED + +# Test env var override of config +VOICEMODE_TELEMETRY=false voicemode test +# Expected: Telemetry DISABLED (env var overrides config) +``` + +## 7. References and Further Reading + +### Standards and Specifications +- [Console Do Not Track](https://consoledonottrack.com/) - Official DO_NOT_TRACK specification +- [DO_NOT_TRACK Dev](https://do-not-track.dev/) - Additional documentation +- [NO_COLOR Standard](https://no-color.org/) - Analogous standard for color output +- [FORCE_COLOR Standard](https://force-color.org/) - Related color control + +### Community Discussions +- [Console Do Not Track - Hacker News](https://news.ycombinator.com/item?id=27746587) - Community discussion +- [npm DO_NOT_TRACK Discussion](https://github.com/npm/feedback/discussions/481) - Feature request +- [Netlify CLI Issue](https://github.com/netlify/cli/issues/737) - Support request +- [dbt-core Issue](https://github.com/dbt-labs/dbt-core/issues/3540) - Implementation + +### Privacy and Compliance +- [Lawful Processing of Telemetry Data](https://www.activemind.legal/guides/telemetry-data/) - GDPR compliance guide +- [Best GDPR-Compliant Analytics Tools](https://posthog.com/blog/best-gdpr-compliant-analytics-tools) - Privacy-preserving alternatives +- [TelemetryDeck Privacy FAQ](https://telemetrydeck.com/docs/guides/privacy-faq/) - Privacy-first telemetry + +### Implementation Examples +- [.NET SDK Telemetry](https://learn.microsoft.com/en-us/dotnet/core/tools/telemetry) - Microsoft's approach +- [AWS CDK Telemetry](https://docs.aws.amazon.com/cdk/v2/guide/cli-telemetry.html) - AWS implementation +- [dbt Anonymous Usage Stats](https://docs.getdbt.com/reference/global-configs/usage-stats) - dbt's documentation +- [Next.js Telemetry](https://nextjs.org/telemetry) - Vercel's approach + +### Tools and Resources +- [toptout Repository](https://github.com/beatcracker/toptout) - Collection of telemetry opt-out methods +- [Telemetry Opt-Out Examples](https://makandracards.com/makandra/624560-disable-telemetry-various-open-source-tools-libraries) - Quick reference + +## 8. Conclusion + +The DO_NOT_TRACK standard provides a clear, universal mechanism for users to opt out of telemetry. While not yet universally adopted, it represents best practice for respecting user privacy preferences. + +**Key takeaways for VoiceMode:** + +1. **Respect DO_NOT_TRACK unconditionally** - This builds user trust and follows emerging standards +2. **Provide tool-specific control** - VOICEMODE_TELEMETRY for granular control +3. **Default to disabled** - Privacy-first and GDPR-compliant +4. **Be transparent** - Clear documentation and easy status checking +5. **Make it easy** - Multiple methods to enable/disable +6. **Fail safely** - When in doubt, disable telemetry + +By implementing these standards, VoiceMode will respect user privacy, comply with regulations, and align with community best practices. + +--- + +**Next Steps:** +1. Implement precedence logic in telemetry module +2. Add configuration file support for persistent preferences +3. Create interactive opt-in prompt +4. Document telemetry in README and privacy policy +5. Add telemetry status/enable/disable commands +6. Write comprehensive tests for all combinations diff --git a/research/tel-002-backend-options.md b/research/tel-002-backend-options.md new file mode 100644 index 00000000..3b7fc26b --- /dev/null +++ b/research/tel-002-backend-options.md @@ -0,0 +1,965 @@ +# Telemetry Backend Options for VoiceMode + +**Research Date:** 2025-12-14 +**Context:** VoiceMode Python CLI/MCP tool telemetry backend evaluation +**Requirements:** Anonymous usage data, opt-in only, rate limiting, idempotent events, simple queries (DAU, retention) +**Expected Scale:** < 1000 users for MVP + +## Executive Summary + +For VoiceMode's MVP telemetry needs, I recommend **Option 2: Cloudflare Workers with D1/Analytics Engine** as the best approach. It offers the optimal balance of simplicity, cost-effectiveness, built-in privacy features, and low maintenance burden while providing excellent scalability for future growth. + +**Runner-up:** Self-hosted FastAPI on a VPS is a solid alternative if you prefer full control and want to avoid vendor lock-in. + +--- + +## Option 1: Self-Hosted Simple Server (Flask/FastAPI on VPS) + +### Overview +Deploy a lightweight Python web service (Flask or FastAPI) on a VPS with SQLite or PostgreSQL backend for storing telemetry events. + +### Setup Complexity +**Medium (3/5)** + +- Requires VPS provisioning (DigitalOcean, Hetzner, Linode) +- Server configuration (nginx/caddy reverse proxy, SSL certificates) +- Application deployment (systemd service, supervisor, or Docker) +- Database setup (SQLite for simple, PostgreSQL for production) +- Rate limiting middleware installation (slowapi, fastapi-limiter) + +**Estimated Time:** 4-8 hours for initial setup + +### Cost at Small Scale (< 1000 users) +**$4-12/month** + +- VPS: $4-6/month (Hetzner CX11, DigitalOcean Basic Droplet - 1 vCPU, 2GB RAM) +- Domain: $1/month (optional, can use IP) +- SSL: $0 (Let's Encrypt) +- Backups: $1/month (optional automated backups) +- Redis (optional for distributed rate limiting): $0-5/month + +**At 1000 users with ~10 events/day:** ~300k events/month fits easily in cheapest tier. + +### Privacy Features +**Excellent** + +- Full control over data collection and retention +- Easy IP anonymization (hash or truncate before storage) +- Configurable data retention policies (delete old events via cron) +- No third-party data sharing +- Can implement opt-out tracking of opt-outs +- Geographic flexibility (choose server location) + +**Implementation:** Simple middleware for IP hashing, scheduled cleanup jobs. + +### Rate Limiting Capabilities +**Excellent** + +- Python libraries make this straightforward: + - **SlowAPI** (FastAPI/Starlette) - battle-tested, millions of requests/month in production + - **fastapi-limiter** - Redis-backed for distributed setups + - **fastapi-simple-rate-limiter** - In-memory for single instance + +- Can implement multi-tier limiting: + - Per anonymous ID (user device) + - Per IP address + - Global limits + +- Example: `@rate_limiter(limit=100, seconds=3600)` for 100 events/hour per client + +### Ease of Querying/Analyzing Data +**Good** + +- Direct SQL access for ad-hoc queries +- Simple Python scripts for DAU/retention calculations +- Can export to CSV/JSON for analysis +- Easy to add Grafana/Metabase for visualization + +**DAU Query Example:** +```sql +SELECT COUNT(DISTINCT anonymous_id) +FROM events +WHERE event_date = CURRENT_DATE; +``` + +**Limitations:** Manual query writing, no built-in analytics dashboards. + +### Maintenance Burden +**Medium-High (3.5/5)** + +- Server OS updates and security patches +- Application dependency updates +- Database backups and monitoring +- SSL certificate renewal (automated with certbot) +- Log rotation and disk space management +- Uptime monitoring + +**Ongoing Time:** 2-4 hours/month + +### Idempotency +**Easy to implement** + +- Use event_id as primary key or unique constraint +- Duplicate inserts fail silently or return existing record +- Can add `created_at` and `last_seen_at` for deduplication tracking + +### Pros +- Complete control over data and infrastructure +- No vendor lock-in +- Simple to understand and debug +- Easy to extend with custom features +- Predictable costs +- Can run locally for testing + +### Cons +- Requires sysadmin skills +- Ongoing maintenance responsibility +- Manual scaling (though not needed at this scale) +- Downtime risk if VPS fails +- Need to implement monitoring yourself + +### Recommendation for VoiceMode +**Good fit if:** You want maximum control and don't mind operational overhead. Best for teams with DevOps experience. + +--- + +## Option 2: Cloudflare Workers with KV or D1 + +### Overview +Serverless edge functions with Cloudflare's KV (key-value store) or D1 (SQLite database) for data storage, plus Analytics Engine for time-series data. + +### Setup Complexity +**Low-Medium (2/5)** + +- Cloudflare account setup (free tier available) +- Wrangler CLI installation (`npm install -g wrangler`) +- Write Worker function (JavaScript/TypeScript) +- Configure D1 database or Analytics Engine +- Deploy with `wrangler deploy` + +**Estimated Time:** 2-4 hours for initial setup + +**Modern Approach (2025):** Cloudflare now offers **Workers Observability** with query builder, shareable queries, and programmatic API access for custom integrations. + +### Cost at Small Scale (< 1000 users) +**$0-5/month** + +**Workers Free Tier:** +- 100,000 requests/day +- 10ms CPU time per request +- Sufficient for MVP + +**D1 Database (if needed):** +- Free tier: 25 billion row reads/month, 50 million row writes/month +- 5GB storage +- More than sufficient for < 1000 users + +**Analytics Engine (recommended for telemetry):** +- Designed specifically for high-cardinality analytics +- Unlimited cardinality (track any dimension) +- Built on ClickHouse +- Free tier likely covers MVP needs + +**At 1000 users, 10 events/day:** 300k events/month fits in free tier easily. + +### Privacy Features +**Excellent** + +- No IP logging by default in Analytics Engine +- Easy to implement IP hashing in Worker before storage +- Data residency control (EU, US regions available) +- Can set TTL on data for automatic deletion +- Cloudflare's privacy-focused positioning +- GDPR compliant infrastructure + +**Workers Observability (2025):** Built-in features for filtering and masking sensitive data. + +### Rate Limiting Capabilities +**Excellent** + +- Built-in rate limiting via Workers +- Can use KV or Durable Objects for distributed rate limiting +- Multiple strategies: + - Per IP (automatic with `request.cf.ipCountry`) + - Per anonymous ID (store in KV) + - Sliding window or fixed window + +**Workers Analytics Engine** specifically designed to handle high-volume, high-cardinality data without traditional rate limit concerns. + +### Ease of Querying/Analyzing Data +**Excellent** + +- **Analytics Engine:** SQL API for querying time-series data +- **Workers Observability (2025):** + - Query Builder with shareable links + - Workers Observability API for programmatic access + - Pre-built visualizations + - Integration with third-party tools (Grafana Cloud, OTLP endpoints) + +- **D1:** Standard SQL queries via API +- REST API for custom integrations +- Built-in dashboards in Cloudflare console + +**DAU Query Example (Analytics Engine):** +```sql +SELECT COUNT(DISTINCT blob1) as dau +FROM analytics_dataset +WHERE timestamp >= NOW() - INTERVAL '1' DAY; +``` + +### Maintenance Burden +**Very Low (1/5)** + +- Fully managed, serverless infrastructure +- Automatic scaling and availability +- No servers to patch or monitor +- Pay-as-you-go pricing +- Built-in DDoS protection + +**Ongoing Time:** < 1 hour/month (mostly code updates) + +### Idempotency +**Medium effort** + +- D1: Standard database constraints (unique keys) +- KV: Implement deduplication logic in Worker +- Analytics Engine: Designed for append-only, may require client-side deduplication + +**Pattern:** Use Durable Objects for idempotency tracking if needed. + +### Pros +- Extremely low maintenance +- Global edge network (low latency worldwide) +- Generous free tier +- Excellent rate limiting primitives +- Modern observability features (2025) +- Scales automatically +- Strong privacy features +- No servers to manage + +### Cons +- Vendor lock-in to Cloudflare +- JavaScript/TypeScript required (not Python) +- Analytics Engine query limitations vs full SQL +- Learning curve for Workers paradigm +- Less flexibility than self-hosted + +### Recommendation for VoiceMode +**Best fit for MVP.** Minimal setup, zero maintenance, built-in analytics, and free tier covers early growth. Modern observability features make this compelling for 2025. + +--- + +## Option 3: AWS S3 + Athena + +### Overview +Store telemetry events as JSON/Parquet files in S3, query with Athena (serverless SQL). + +### Setup Complexity +**Medium-High (4/5)** + +- AWS account setup and IAM configuration +- S3 bucket creation with lifecycle policies +- Athena database and table schema definition +- Data partitioning strategy (by date/hour) +- File format selection (JSON, Parquet, CSV) +- Optional: Glue crawler for schema discovery +- Lambda function for data ingestion (if not directly writing to S3) + +**Estimated Time:** 6-10 hours for proper setup + +### Cost at Small Scale (< 1000 users) +**$0.50-2/month** + +**S3 Storage:** +- $0.023/GB/month (first 50TB) +- At 300k events/month (avg 1KB each): ~300MB = $0.01/month +- With Parquet compression (3:1): ~$0.003/month + +**Athena Queries:** +- $5/TB scanned +- Minimum $0.58/month for lowest usage +- ~10 queries/month on 300MB data: ~$0.015 + +**S3 API Requests:** +- GET: $0.0004 per 1000 requests +- PUT: $0.005 per 1000 requests +- At 300k events: ~$1.50 for PUTs + +**Hidden Costs:** Data retrieval, transfer, scattered small files can increase costs. + +**Total Estimate:** $0.50-2/month, scales well but costs rise with query frequency. + +### Privacy Features +**Good** + +- Full control over data location (region selection) +- Easy to implement IP hashing before storage +- S3 lifecycle policies for automatic deletion +- Server-side encryption (SSE-S3, SSE-KMS) +- VPC endpoints for private access +- Compliance certifications (HIPAA, PCI-DSS) + +**Limitation:** Athena query results stored in S3 by default (need lifecycle rules to clean up). + +### Rate Limiting Capabilities +**Requires Additional Service** + +- S3 has no native rate limiting +- Need API Gateway + Lambda in front for rate limiting +- API Gateway: 10,000 requests/second default, throttling configurable +- Adds complexity and cost (~$1-3/month) + +**Alternative:** Client-side rate limiting (less reliable). + +### Ease of Querying/Analyzing Data +**Good for Analysts, Complex for Others** + +- Standard SQL via Athena console or API +- Supports complex queries, JOINs, window functions +- Can integrate with QuickSight, Tableau, Python (boto3) +- Query results available as CSV/JSON + +**DAU Query Example:** +```sql +SELECT COUNT(DISTINCT anonymous_id) as dau +FROM telemetry_events +WHERE year=2025 AND month=12 AND day=14; +``` + +**Challenges:** +- Schema evolution requires table updates +- Query performance depends on partitioning strategy +- Costs increase with data scanned (need partitioning and compression) +- Manual query writing required + +### Maintenance Burden +**Medium (3/5)** + +- Low infrastructure maintenance (serverless) +- Moderate data engineering required: + - Optimize partitioning as data grows + - Monitor and cleanup query results + - Update schemas for new events + - Implement file consolidation (avoid small files) + +- Cost monitoring important (easy to overspend on queries) +- S3 lifecycle policy management + +**Ongoing Time:** 2-3 hours/month + +### Idempotency +**Challenging** + +- S3 is append-only, no native deduplication +- Options: + 1. Athena query-time deduplication (DISTINCT, slower) + 2. Lambda preprocessing before S3 write (check DynamoDB) + 3. Client-side responsibility + +**Best Pattern:** Use event_id in filename or Lambda + DynamoDB for deduplication. + +### Pros +- Highly scalable (petabyte-scale) +- Pay only for what you use +- Standard SQL queries +- Rich ecosystem (Glue, QuickSight, etc.) +- Durable storage (99.999999999%) +- Good for long-term data archival + +### Cons +- Complex setup for simple use case +- Rate limiting requires additional services +- Idempotency is challenging +- Query costs can surprise you +- Small file overhead problem +- Requires data engineering knowledge +- Cold query latency (first query slow) + +### Recommendation for VoiceMode +**Overkill for MVP.** Better suited for high-scale (millions of events) or if already invested in AWS ecosystem. Complexity outweighs benefits at small scale. + +--- + +## Option 4: Privacy-Focused Analytics Services + +### Plausible Analytics (Self-Hosted Community Edition) + +#### Overview +Open-source, privacy-friendly web analytics platform built with Elixir/Phoenix, PostgreSQL, and ClickHouse. AGPL licensed. + +#### Setup Complexity +**Medium-High (4/5)** + +- Docker Compose deployment recommended +- Requires: PostgreSQL + ClickHouse databases +- Configuration via environment variables +- Reverse proxy setup (nginx/Caddy) +- SSL certificate management + +**Resource Requirements:** 4 vCPU, 16GB RAM, 30GB+ storage +**Estimated Time:** 4-6 hours for initial setup + +#### Cost at Small Scale +**$4-20/month** + +- VPS: $4-12/month (DigitalOcean, Hetzner - must meet resource requirements) +- Backups: $2-5/month +- Domain: $1/month (optional) + +**Note:** One user reports $4/month on DigitalOcean Droplet, but resource-intensive setup may need higher tier ($12-20/month) for reliable performance. + +**Cloud Alternative:** $9/month for 10k pageviews, 1 site (starter plan). Not cost-effective for CLI telemetry. + +#### Privacy Features +**Excellent (Best-in-Class)** + +- Built for privacy (GDPR, CCPA, PECR compliant) +- No cookies, no personal data collection +- IP anonymization built-in +- EU-hosted option available +- Open source, auditable code +- Can bypass ad blockers when self-hosted + +#### Rate Limiting Capabilities +**Not Built-In** + +- Designed for web analytics, not API telemetry +- Would need custom reverse proxy rate limiting (nginx limit_req) +- Not ideal for programmatic event submission + +#### Ease of Querying/Analyzing Data +**Web Analytics Focus** + +- Beautiful dashboards for pageviews, visitors, sources +- Custom events and goals supported +- Funnel analysis available +- Google Search Console integration +- Raw data access via ClickHouse SQL (self-hosted only) + +**Limitation:** Not designed for generic telemetry queries (DAU from CLI usage, retention cohorts). Would need custom ClickHouse queries. + +#### Maintenance Burden +**Medium-High (4/5)** + +- Database management (PostgreSQL + ClickHouse) +- Docker container updates +- Resource monitoring (ClickHouse can be memory-hungry) +- Backup management +- Community support only (no premium support) + +**Ongoing Time:** 3-5 hours/month + +#### Idempotency +**Not Designed For** + +- Built for web analytics (unique pageviews tracked by session) +- Would need custom implementation for event idempotency + +#### Recommendation for VoiceMode +**Not ideal.** Great for privacy-focused web analytics, but overkill and wrong tool for CLI telemetry. High resource requirements and web-analytics focus make it a poor fit. + +--- + +### PostHog (Community Edition Self-Hosted) + +#### Overview +All-in-one product analytics platform (analytics, session replay, feature flags, A/B testing). Community Edition is open source (MIT license). + +#### Setup Complexity +**High (4.5/5)** + +- Docker-based deployment (hobby instance) +- Requires 4GB+ RAM minimum +- PostgreSQL + ClickHouse + Redis + Kafka (full stack) +- Kubernetes option deprecated for new deployments + +**Estimated Time:** 6-12 hours for production-ready setup + +#### Cost at Small Scale +**$12-30/month** + +- VPS: $12-20/month (minimum 4 vCPU, 16GB RAM - Hetzner CCX23) +- Storage: Included (need 30GB+) +- Backups: $5-10/month + +**Scale Limits:** Hobby deployment scales to ~100k events/month, then PostHog recommends migrating to Cloud. + +#### Privacy Features +**Excellent** + +- Self-hosted = full data control +- Customer data stays on your servers +- Can circumvent ad blockers +- Compliance-friendly (GDPR, HIPAA with configuration) +- No third-party data sharing + +**FOSS Option:** `posthog-foss` repository for 100% open source without proprietary features. + +#### Rate Limiting Capabilities +**Application-Level** + +- Designed for high-volume event ingestion +- Built-in throttling for scale +- Not configurable per-client (enterprise feature) +- Would need reverse proxy for custom rate limiting + +#### Ease of Querying/Analyzing Data +**Excellent for Product Analytics** + +- Rich dashboards (funnels, retention, user paths) +- Custom events and properties +- SQL query interface (ClickHouse) +- Export capabilities +- Retention analysis built-in + +**Perfect for:** DAU, retention cohorts, feature usage tracking + +#### Maintenance Burden +**High (4.5/5)** + +- Complex multi-service stack +- Database maintenance (ClickHouse, PostgreSQL, Redis) +- Updates can be breaking +- Community support only (no premium support for CE) +- Resource-intensive + +**Ongoing Time:** 4-6 hours/month + +#### Idempotency +**Event Deduplication Available** + +- PostHog supports idempotency via event UUIDs +- Can configure deduplication windows +- Good fit for telemetry use case + +#### Recommendation for VoiceMode +**Overkill for MVP but future-proof.** If you plan to grow into feature flags, A/B testing, and rich product analytics, PostHog CE is worth the investment. For simple telemetry, it's too heavyweight. Resource requirements and maintenance burden make it impractical for < 1000 users. + +--- + +## Option 5: Simple Webhook to Google Sheets or Airtable + +### Overview +Use Google Sheets or Airtable as a database, accepting telemetry events via webhooks (Google Apps Script or Airtable API). + +### Setup Complexity +**Very Low (1/5)** + +**Google Sheets:** +- Create spreadsheet +- Write Apps Script webhook handler (~20 lines) +- Deploy as web app +- Set permissions + +**Airtable:** +- Create base +- Use Airtable API or webhook services (Zapier, Make, n8n) + +**Estimated Time:** 1-2 hours + +### Cost at Small Scale +**$0** + +**Google Sheets:** +- Free for personal use +- Google Workspace: $6/user/month (unnecessary for telemetry) + +**Airtable:** +- Free tier: 1,000 records, 1 GB attachments +- Plus: $10/seat/month for 5,000 records + +**At 1000 users, 10 events/day:** 300k events/month exceeds free tiers quickly. + +### Privacy Features +**Poor** + +- Data stored on third-party platforms +- Limited control over data residency +- Google/Airtable terms of service apply +- IP logging by default (Google Apps Script) +- Manual anonymization required +- Not GDPR-friendly without business tier + +### Rate Limiting Capabilities +**Very Poor (Deal Breaker)** + +**Google Sheets API Limits:** +- **Write requests: 60/minute per project** (hard limit) +- Read requests: 300/minute +- User quota: 100 requests/100 seconds +- Project quota: 500 requests/100 seconds +- Response: HTTP 429 on rate limit + +**At 1000 users, 10 events/day:** +- 300k events/month = ~7 events/minute average +- Peaks could exceed 60/minute easily +- **Not viable for real-time telemetry** + +**Airtable API Limits:** +- 5 requests/second per base +- ~300 requests/minute +- Better than Sheets but still limiting + +### Ease of Querying/Analyzing Data +**Good for Humans, Poor for Automation** + +**Google Sheets:** +- Spreadsheet interface familiar to everyone +- Pivot tables, charts, formulas +- Can share with stakeholders +- Export to CSV/Excel +- Google Data Studio integration + +**Limitations:** +- Max 10 million cells per spreadsheet +- Slow with > 100k rows +- No SQL interface +- Manual data analysis + +**Airtable:** +- Rich data types and views +- Better performance than Sheets +- Integrations with BI tools +- API for programmatic access + +### Maintenance Burden +**Low (1.5/5)** + +- No infrastructure to manage +- Apps Script auto-updates +- Storage managed by provider +- Need to monitor quota usage +- May need periodic archival (manual) + +**Ongoing Time:** 1-2 hours/month (mostly data cleanup) + +### Idempotency +**Manual Implementation Required** + +**Google Sheets:** +- Apps Script can check for duplicate event_id before insert +- Slow lookups (linear scan or query) +- Race conditions possible with concurrent requests + +**Airtable:** +- Better with linked records and unique field validation +- Still not designed for idempotent API use + +### Pros +- Zero cost for MVP +- Extremely simple setup +- No coding required (via Zapier/Make) +- Familiar interface for non-technical stakeholders +- Easy to export and share data +- Good for proof-of-concept + +### Cons +- **Rate limits are a deal breaker** (60 writes/min for Sheets) +- Poor scalability (100k+ rows = slow) +- Not designed for programmatic data collection +- Privacy concerns with third-party storage +- No built-in rate limiting or idempotency +- Response timeouts (Apps Script execution limits) +- Not suitable for production telemetry +- Security concerns (webhook verification needed) + +### Recommendation for VoiceMode +**Only for prototype/testing.** Good for validating event schema and testing client integration, but rate limits make it unsuitable for production. Would break with even modest usage. Not recommended for MVP. + +--- + +## Comparison Matrix + +| Criterion | Self-Hosted (FastAPI) | Cloudflare Workers | AWS S3+Athena | Plausible CE | PostHog CE | Google Sheets | +|-----------|----------------------|-------------------|---------------|--------------|------------|---------------| +| **Setup Complexity** | Medium (3/5) | Low-Medium (2/5) | Medium-High (4/5) | Medium-High (4/5) | High (4.5/5) | Very Low (1/5) | +| **Monthly Cost** | $4-12 | $0-5 | $0.50-2 | $4-20 | $12-30 | $0 | +| **Privacy** | Excellent | Excellent | Good | Excellent | Excellent | Poor | +| **Rate Limiting** | Excellent | Excellent | Requires Extra Service | Not Built-In | Application-Level | Very Poor (60/min) | +| **Query Ease** | Good (SQL) | Excellent (SQL+UI) | Good (SQL) | Web Analytics | Excellent (Analytics) | Good (Manual) | +| **Maintenance** | Medium-High (3.5/5) | Very Low (1/5) | Medium (3/5) | Medium-High (4/5) | High (4.5/5) | Low (1.5/5) | +| **Idempotency** | Easy | Medium | Challenging | Not Designed | Built-In | Manual | +| **Scalability** | Manual (Good) | Automatic (Excellent) | Excellent | Limited (100k events) | Limited (100k events) | Poor (10M cells) | +| **Vendor Lock-In** | None | High (Cloudflare) | Medium (AWS) | None | None | High (Google) | +| **Best For** | Control & Flexibility | MVP & Low Maintenance | Large Scale AWS Shops | Web Analytics | Full Product Analytics | Prototypes Only | + +--- + +## Detailed Recommendation for VoiceMode MVP + +### Winner: Cloudflare Workers + Analytics Engine + +**Why:** + +1. **Minimal Setup Effort (2-4 hours)** + - Simple Worker function deployment + - No server provisioning or maintenance + - Built-in observability (2025 features) + +2. **Cost-Effective ($0-5/month)** + - Free tier covers MVP entirely + - Predictable scaling costs + - No surprise charges + +3. **Built-In Privacy** + - Easy IP anonymization + - Data residency controls + - GDPR-compliant infrastructure + - No personal data required + +4. **Excellent Rate Limiting** + - Workers can enforce per-client limits + - Distributed rate limiting with Durable Objects + - Built-in DDoS protection + +5. **Perfect for Telemetry Use Case** + - Analytics Engine designed for high-cardinality event data + - SQL query interface for DAU/retention + - Modern observability tools (query builder, shareable queries) + - OTLP export for future third-party integrations + +6. **Low Maintenance Burden** + - Fully managed, serverless + - Automatic scaling + - No patching or monitoring + - Focus on product, not infrastructure + +**Trade-offs:** +- Vendor lock-in (mitigated by event data ownership) +- JavaScript/TypeScript instead of Python (small worker functions) +- Less flexibility than self-hosted (acceptable for MVP) + +**Implementation Path:** + +```javascript +// Simplified Worker example +export default { + async fetch(request, env) { + const event = await request.json(); + + // Rate limiting check (per anonymous_id) + const rateLimitKey = `rate:${event.anonymous_id}`; + const count = await env.KV.get(rateLimitKey); + if (count && parseInt(count) > 100) { + return new Response('Rate limit exceeded', { status: 429 }); + } + + // Idempotency check + const eventKey = `event:${event.event_id}`; + const existing = await env.KV.get(eventKey); + if (existing) { + return new Response('Event already recorded', { status: 200 }); + } + + // Store in Analytics Engine + await env.ANALYTICS.writeDataPoint({ + indexes: [event.anonymous_id], + blobs: [event.os, event.provider, event.version], + doubles: [1], + }); + + // Mark event as processed + await env.KV.put(eventKey, '1', { expirationTtl: 86400 }); + + // Update rate limit counter + await env.KV.put(rateLimitKey, (parseInt(count || 0) + 1).toString(), + { expirationTtl: 3600 }); + + return new Response('OK', { status: 200 }); + } +}; +``` + +**Query Example (DAU):** +```sql +SELECT COUNT(DISTINCT index1) as dau +FROM analytics_dataset +WHERE timestamp >= NOW() - INTERVAL '1' DAY; +``` + +--- + +### Runner-Up: Self-Hosted FastAPI + +**When to Choose:** + +- You have DevOps experience and don't mind maintenance +- You want zero vendor lock-in +- You prefer Python for everything +- You plan to heavily customize analytics logic +- You want to run locally for development + +**Implementation Path:** + +```python +# Simplified FastAPI example +from fastapi import FastAPI, HTTPException +from slowapi import Limiter +from slowapi.util import get_remote_address +import sqlite3 + +app = FastAPI() +limiter = Limiter(key_func=get_remote_address) + +@app.post("/event") +@limiter.limit("100/hour") +async def record_event(event: TelemetryEvent): + # Idempotency check + conn = sqlite3.connect('telemetry.db') + cursor = conn.cursor() + + try: + cursor.execute( + "INSERT INTO events (event_id, anonymous_id, os, provider, version, timestamp) " + "VALUES (?, ?, ?, ?, ?, ?)", + (event.event_id, event.anonymous_id, event.os, event.provider, + event.version, event.timestamp) + ) + conn.commit() + except sqlite3.IntegrityError: + # Event already exists (duplicate event_id) + return {"status": "duplicate"} + finally: + conn.close() + + return {"status": "ok"} +``` + +**Total Cost:** $4-6/month (Hetzner CX11) +**Setup Time:** 4-8 hours +**Maintenance:** 2-4 hours/month + +--- + +## Implementation Recommendations + +### Phase 1: MVP (Cloudflare Workers) + +1. **Week 1: Setup** + - Create Cloudflare account + - Deploy basic Worker with D1 database + - Implement rate limiting (per anonymous_id + IP) + - Add idempotency handling (event_id deduplication) + +2. **Week 2: Client Integration** + - Python client library for event submission + - Opt-in configuration handling + - Retry logic with exponential backoff + - Local event queue for offline scenarios + +3. **Week 3: Analytics** + - Set up Analytics Engine for time-series data + - Create basic queries (DAU, retention, provider usage) + - Workers Observability dashboard configuration + - Alert on error rates + +4. **Week 4: Testing & Launch** + - Load testing (simulate 1000 users) + - Privacy audit (ensure no PII leakage) + - Documentation (opt-in/opt-out process) + - Soft launch with monitoring + +### Phase 2: Scale & Iterate (Month 2-3) + +- Add custom queries for specific insights +- Implement data export for long-term archival +- Consider self-hosted backup if vendor lock-in becomes concern +- Evaluate migration to PostHog if product analytics needs expand + +### Migration Path (If Needed) + +If Cloudflare Workers doesn't meet future needs: + +1. Export event data (JSON/CSV via Workers Observability API) +2. Import to new system (PostgreSQL, ClickHouse, PostHog) +3. Update client to point to new endpoint +4. Run both systems in parallel during transition + +**Key:** Own your event schema and data from day one. + +--- + +## Privacy & Compliance Checklist + +For any chosen backend: + +- [ ] IP anonymization (hash or truncate) +- [ ] No collection of PII (names, emails, device IDs) +- [ ] Clear opt-in mechanism (disabled by default) +- [ ] Easy opt-out (`voicemode telemetry disable`) +- [ ] Data retention policy (30-90 days for MVP) +- [ ] Transparency documentation (what we collect, why, how) +- [ ] Anonymous ID generation (UUID4, not tied to user identity) +- [ ] Secure transmission (HTTPS only) +- [ ] Audit logging (what events were sent, when) +- [ ] Data export capability (user can request their data) + +--- + +## Cost Projections + +### 1,000 Users (10 events/day/user) +- Events/month: 300,000 + +| Backend | Monthly Cost | +|---------|--------------| +| Cloudflare Workers | $0-5 (free tier) | +| Self-Hosted FastAPI | $4-12 | +| AWS S3 + Athena | $0.50-2 | +| Google Sheets | $0 (breaks at scale) | +| PostHog CE | $12-30 | +| Plausible CE | $4-20 | + +### 10,000 Users (10 events/day/user) +- Events/month: 3,000,000 + +| Backend | Monthly Cost | +|---------|--------------| +| Cloudflare Workers | $5-15 | +| Self-Hosted FastAPI | $12-30 (upgrade VPS) | +| AWS S3 + Athena | $2-10 | +| Google Sheets | Not viable | +| PostHog CE | $50-100 (multi-instance) | +| Plausible CE | $20-50 (upgrade VPS) | + +### 100,000 Users (10 events/day/user) +- Events/month: 30,000,000 + +| Backend | Monthly Cost | +|---------|--------------| +| Cloudflare Workers | $50-100 | +| Self-Hosted FastAPI | $100-200 (multi-instance + load balancer) | +| AWS S3 + Athena | $20-50 | +| PostHog Cloud | $450+ (migrate from CE) | +| Plausible Cloud | $69+ (migrate from CE) | + +**Note:** At 100k users, all self-hosted options become operationally expensive (engineering time). + +--- + +## Sources + +This research is based on current (2025) documentation and best practices: + +- [Cloudflare Workers Observability](https://blog.cloudflare.com/introducing-workers-observability-logs-metrics-and-queries-all-in-one-place/) +- [Cloudflare Workers Analytics Engine](https://blog.cloudflare.com/workers-analytics-engine/) +- [PostHog Self-Hosted Documentation](https://posthog.com/docs/self-host) +- [Plausible Self-Hosted Guide](https://plausible.io/self-hosted-web-analytics) +- [AWS Athena Pricing](https://aws.amazon.com/athena/pricing/) +- [CLI Telemetry Best Practices](https://marcon.me/articles/cli-telemetry-best-practices/) +- [FastAPI Rate Limiting with SlowAPI](https://github.com/laurentS/slowapi) +- [Google Sheets API Limits](https://hevodata.com/learn/google-sheets-webhooks-integration/) + +--- + +## Next Steps + +1. **Decision:** Review this report and select backend (recommend: Cloudflare Workers) +2. **Prototype:** Build proof-of-concept Worker in 1-2 days +3. **Schema:** Define telemetry event schema (see tel-003-event-schema.md) +4. **Client:** Implement Python client library for event submission +5. **Privacy:** Draft opt-in documentation and privacy policy +6. **Testing:** Load test with simulated traffic +7. **Launch:** Soft launch with monitoring and feedback collection + +**Timeline:** 2-3 weeks to production-ready MVP. diff --git a/tests/manual/test_telemetry_environment_detection.py b/tests/manual/test_telemetry_environment_detection.py new file mode 100755 index 00000000..b309ae3e --- /dev/null +++ b/tests/manual/test_telemetry_environment_detection.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +""" +Test script for tel-004: Environment Detection + +This script demonstrates the environment detection functions added to +voice_mode/config.py for the VoiceMode telemetry system. +""" + +import json +from voice_mode.config import ( + get_os_type, + get_installation_method, + get_mcp_host, + get_execution_source, + get_environment_info +) + + +def main(): + print("=" * 70) + print("VoiceMode Telemetry - Environment Detection Test (tel-004)") + print("=" * 70) + print() + + # Test individual detection functions + print("1. Operating System Detection:") + print(f" get_os_type() = {get_os_type()!r}") + print() + + print("2. Installation Method Detection:") + print(f" get_installation_method() = {get_installation_method()!r}") + print(" Possible values: 'dev', 'uv', 'pip', 'unknown'") + print() + + print("3. MCP Host Detection:") + mcp_host = get_mcp_host() + print(f" get_mcp_host() = {mcp_host!r}") + print(" Known hosts: claude-code, cursor, cline, electron-app") + print() + + print("4. Execution Source Detection:") + print(f" get_execution_source() = {get_execution_source()!r}") + print(" Possible values: 'mcp', 'cli'") + print() + + print("5. Complete Environment Info:") + env_info = get_environment_info() + print(" get_environment_info() =") + print(" " + json.dumps(env_info, indent=2).replace("\n", "\n ")) + print() + + # Verify caching works + print("6. Verifying Lazy Caching:") + info1 = get_environment_info() + info2 = get_environment_info() + if info1 == info2: + print(" ✓ Caching works - identical results on repeated calls") + else: + print(" ✗ Caching failed - results differ!") + return False + print() + + print("=" * 70) + print("All environment detection functions working correctly!") + print("=" * 70) + + return True + + +if __name__ == "__main__": + success = main() + exit(0 if success else 1) diff --git a/voice_mode/cli.py b/voice_mode/cli.py index fdfe2719..77a49fd0 100644 --- a/voice_mode/cli.py +++ b/voice_mode/cli.py @@ -68,6 +68,12 @@ def voice_mode_main_cli(ctx, debug, tools_enabled, tools_disabled): if tools_disabled: os.environ['VOICEMODE_TOOLS_DISABLED'] = tools_disabled + # Check if we should prompt for telemetry consent (only for interactive CLI commands) + # Skip for MCP server mode (when no subcommand) as MCP uses stdio and can't prompt + if ctx.invoked_subcommand is not None: + from voice_mode.utils.telemetry_prompt import maybe_prompt_for_telemetry + maybe_prompt_for_telemetry() + if ctx.invoked_subcommand is None: # No subcommand - run MCP server # Note: warnings are already suppressed at module level unless debug is enabled diff --git a/voice_mode/config.py b/voice_mode/config.py index 8ff7688d..9a091a83 100644 --- a/voice_mode/config.py +++ b/voice_mode/config.py @@ -9,6 +9,7 @@ import logging import asyncio import subprocess +import uuid from pathlib import Path from typing import Dict, Optional from datetime import datetime @@ -796,6 +797,338 @@ def trace_calls(frame, event, arg): return logger +# ==================== TELEMETRY CONFIGURATION ==================== + +# Telemetry opt-in/opt-out configuration +# Precedence (highest to lowest): +# 1. DO_NOT_TRACK (any value) → telemetry DISABLED (universal opt-out) +# 2. VOICEMODE_TELEMETRY=true → enabled +# 3. VOICEMODE_TELEMETRY=false → disabled +# 4. VOICEMODE_TELEMETRY=ask → need to prompt user (default) + +# DO_NOT_TRACK - universal opt-out standard (https://consoledonottrack.com/) +# If set to any value, telemetry is disabled +DO_NOT_TRACK = os.getenv("DO_NOT_TRACK") + +# VOICEMODE_TELEMETRY - tool-specific telemetry control +# Values: "ask" (default), "true", "false" +VOICEMODE_TELEMETRY = os.getenv("VOICEMODE_TELEMETRY", "ask").lower() + +# Validate VOICEMODE_TELEMETRY value +if VOICEMODE_TELEMETRY not in ("ask", "true", "false", "1", "0", "yes", "no", "on", "off"): + VOICEMODE_TELEMETRY = "ask" + +# Normalize boolean-style values to "true"/"false" +if VOICEMODE_TELEMETRY in ("1", "yes", "on"): + VOICEMODE_TELEMETRY = "true" +elif VOICEMODE_TELEMETRY in ("0", "no", "off"): + VOICEMODE_TELEMETRY = "false" + +# VOICEMODE_TELEMETRY_ENDPOINT - backend URL for telemetry data +# Default endpoint (temporary - will change to custom domain before release) +# Users can override with VOICEMODE_TELEMETRY_ENDPOINT environment variable +_DEFAULT_TELEMETRY_ENDPOINT = "https://voicemode-telemetry.late-limit-5e4c.workers.dev/telemetry" +VOICEMODE_TELEMETRY_ENDPOINT = os.getenv("VOICEMODE_TELEMETRY_ENDPOINT", _DEFAULT_TELEMETRY_ENDPOINT) + + +def is_telemetry_enabled() -> bool: + """Determine if telemetry should be enabled based on environment and configuration. + + Precedence (highest to lowest): + 1. DO_NOT_TRACK (any value) → telemetry DISABLED (universal opt-out) + 2. VOICEMODE_TELEMETRY=true → enabled + 3. VOICEMODE_TELEMETRY=false → disabled + 4. VOICEMODE_TELEMETRY=ask → disabled (requires explicit opt-in) + + Returns: + True if telemetry is enabled, False otherwise + """ + # 1. Check DO_NOT_TRACK (any value means opt-out) + if DO_NOT_TRACK is not None: + return False + + # 2. Check VOICEMODE_TELEMETRY + if VOICEMODE_TELEMETRY == "true": + return True + elif VOICEMODE_TELEMETRY == "false": + return False + elif VOICEMODE_TELEMETRY == "ask": + # "ask" means telemetry is disabled until user explicitly opts in + # The opt-in prompt will be handled by the UX layer (tel-007) + return False + + # Default: disabled (privacy-first) + return False + + +def get_telemetry_status() -> Dict[str, any]: + """Get detailed telemetry status information. + + Returns: + Dictionary with telemetry status details: + - enabled: bool - Whether telemetry is currently enabled + - reason: str - Why telemetry is enabled/disabled + - do_not_track: bool - Whether DO_NOT_TRACK is set + - voicemode_telemetry: str - Value of VOICEMODE_TELEMETRY + - endpoint: str | None - Telemetry endpoint URL + - telemetry_id: str - Anonymous telemetry ID + """ + enabled = is_telemetry_enabled() + + # Determine reason + if DO_NOT_TRACK is not None: + reason = "DO_NOT_TRACK environment variable is set (universal opt-out)" + elif VOICEMODE_TELEMETRY == "true": + reason = "VOICEMODE_TELEMETRY=true (explicit opt-in)" + elif VOICEMODE_TELEMETRY == "false": + reason = "VOICEMODE_TELEMETRY=false (explicit opt-out)" + elif VOICEMODE_TELEMETRY == "ask": + reason = "VOICEMODE_TELEMETRY=ask (user has not been prompted yet)" + else: + reason = "Default privacy-first policy (disabled)" + + return { + "enabled": enabled, + "reason": reason, + "do_not_track": DO_NOT_TRACK is not None, + "voicemode_telemetry": VOICEMODE_TELEMETRY, + "endpoint": VOICEMODE_TELEMETRY_ENDPOINT, + "telemetry_id": get_telemetry_id() if enabled else None + } + + +def get_telemetry_id() -> str: + """Get or create the anonymous telemetry ID. + + The telemetry ID is a permanent UUID that uniquely identifies this VoiceMode + installation. It is generated once on first use and stored in ~/.voicemode/telemetry_id. + + This ID is used for anonymous usage analytics and does not contain any personal + information. It allows aggregating usage patterns while maintaining user privacy. + + Returns: + UUID string identifying this installation + """ + telemetry_id_path = BASE_DIR / "telemetry_id" + + # Check if telemetry ID already exists + if telemetry_id_path.exists(): + try: + telemetry_id = telemetry_id_path.read_text().strip() + # Validate it's a valid UUID + uuid.UUID(telemetry_id) + return telemetry_id + except (ValueError, OSError): + # Invalid UUID or read error, regenerate + pass + + # Generate new telemetry ID + telemetry_id = str(uuid.uuid4()) + + try: + # Ensure base directory exists + BASE_DIR.mkdir(parents=True, exist_ok=True) + + # Write telemetry ID to file with secure permissions + telemetry_id_path.write_text(telemetry_id) + os.chmod(telemetry_id_path, 0o600) + except OSError: + # Continue even if we can't save - telemetry isn't critical + pass + + return telemetry_id + + +# Environment detection - lazy cached values +_environment_cache: Dict[str, Optional[str]] = {} + + +def get_os_type() -> str: + """Detect the operating system type. + + Returns: + OS type string: "Darwin" (macOS), "Linux", "Windows", or "Unknown" + """ + if "os_type" not in _environment_cache: + import platform + _environment_cache["os_type"] = platform.system() + return _environment_cache["os_type"] + + +def get_installation_method() -> str: + """Detect how VoiceMode was installed. + + Checks for: + - Development mode: Running from source with editable install + - UV: Installed with uv package manager + - Pip: Installed with pip + + Returns: + Installation method: "dev", "uv", "pip", or "unknown" + """ + if "install_method" not in _environment_cache: + import sys + + method = "unknown" + + # Check if running from editable/development install + # In dev mode, __file__ points to source directory + try: + module_file = Path(__file__).resolve() + + # Check if we're in a git repository (dev mode indicator) + try: + git_root = subprocess.run( + ["git", "rev-parse", "--show-toplevel"], + capture_output=True, + text=True, + cwd=module_file.parent, + timeout=1 + ) + if git_root.returncode == 0: + # Running from git repo suggests dev mode + method = "dev" + except (subprocess.TimeoutExpired, FileNotFoundError): + pass + + # If not dev, check for package manager indicators + if method == "unknown": + # Check sys.prefix for uv indicators + prefix = Path(sys.prefix) + + # UV typically uses .venv or creates virtual envs with specific structure + if (prefix / ".uv").exists() or ".uv" in str(prefix): + method = "uv" + # Check for pip install + elif (prefix / "lib").exists(): + method = "pip" + + except Exception: + # If detection fails, fall back to unknown + pass + + _environment_cache["install_method"] = method + + return _environment_cache["install_method"] + + +def get_mcp_host() -> Optional[str]: + """Detect the MCP host application. + + Checks environment variables and process tree to identify: + - Claude Code (CLAUDE_CODE_*) + - Cursor (CURSOR_*) + - Cline (CLINE_*) + - Other MCP hosts + + Returns: + MCP host name or None if unknown + """ + if "mcp_host" not in _environment_cache: + host = None + + # Check environment variables for known hosts + if any(key.startswith("CLAUDE_CODE_") for key in os.environ): + host = "claude-code" + elif any(key.startswith("CURSOR_") for key in os.environ): + host = "cursor" + elif any(key.startswith("CLINE_") for key in os.environ): + host = "cline" + elif "MCP_HOST" in os.environ: + # Generic MCP_HOST variable + host = os.environ["MCP_HOST"] + + # If no env vars found, check process tree + if host is None: + try: + # Get parent process name + ppid = os.getppid() + proc_result = subprocess.run( + ["ps", "-p", str(ppid), "-o", "comm="], + capture_output=True, + text=True, + timeout=1 + ) + if proc_result.returncode == 0: + parent_name = proc_result.stdout.strip().lower() + + # Check for known MCP host process names + if "claude" in parent_name or "code" in parent_name: + host = "claude-code" + elif "cursor" in parent_name: + host = "cursor" + elif "cline" in parent_name: + host = "cline" + elif "node" in parent_name or "electron" in parent_name: + # Generic Node/Electron app - likely an MCP host + host = "electron-app" + + except (subprocess.TimeoutExpired, FileNotFoundError, OSError): + pass + + _environment_cache["mcp_host"] = host + + return _environment_cache["mcp_host"] + + +def get_execution_source() -> str: + """Detect whether running as MCP server or CLI command. + + Checks: + - stdin/stdout to detect MCP stdio transport + - Command line arguments + - Parent process + + Returns: + Execution source: "mcp" or "cli" + """ + if "exec_source" not in _environment_cache: + import sys + + source = "cli" # Default to CLI + + # Check if stdin is a pipe (MCP servers use stdio transport) + try: + import stat + mode = os.fstat(sys.stdin.fileno()).st_mode + if stat.S_ISFIFO(mode): + # stdin is a pipe - likely MCP mode + source = "mcp" + except (OSError, AttributeError): + pass + + # Check command line arguments + if source == "cli" and len(sys.argv) > 0: + # If invoked with server.py or as a module, it's MCP + if "server.py" in sys.argv[0] or "-m" in sys.argv: + source = "mcp" + + # Check for MCP-specific environment variables + if "MCP_SERVER" in os.environ or "MCP_TRANSPORT" in os.environ: + source = "mcp" + + _environment_cache["exec_source"] = source + + return _environment_cache["exec_source"] + + +def get_environment_info() -> Dict[str, Optional[str]]: + """Get all environment detection information. + + Returns: + Dictionary with environment details: + - os_type: Operating system (Darwin, Linux, Windows) + - install_method: How VoiceMode was installed (dev, uv, pip) + - mcp_host: MCP host application (claude-code, cursor, etc.) + - exec_source: Execution context (mcp, cli) + """ + return { + "os_type": get_os_type(), + "install_method": get_installation_method(), + "mcp_host": get_mcp_host(), + "exec_source": get_execution_source() + } + # ==================== DIRECTORY INITIALIZATION ==================== def initialize_directories(): @@ -1046,6 +1379,10 @@ def protect_stderr(): # Set up logger logger = setup_logging() +# Initialize telemetry ID +TELEMETRY_ID = get_telemetry_id() +logger.debug(f"Telemetry ID: {TELEMETRY_ID}") + # Log any format validation warnings if 'AUDIO_FORMAT' in locals() and '_invalid_audio_format' in locals(): logger.warning(f"Unsupported audio format '{_invalid_audio_format}', falling back to 'pcm'") diff --git a/voice_mode/resources/telemetry.py b/voice_mode/resources/telemetry.py new file mode 100644 index 00000000..61fc743e --- /dev/null +++ b/voice_mode/resources/telemetry.py @@ -0,0 +1,176 @@ +"""MCP resources for telemetry status and information.""" + +from ..server import mcp +from ..config import ( + logger, + get_telemetry_status, + get_telemetry_id, + VOICEMODE_TELEMETRY, + DO_NOT_TRACK, + VOICEMODE_TELEMETRY_ENDPOINT, +) + + +@mcp.resource("voicemode://telemetry/status") +async def telemetry_status() -> str: + """ + Telemetry opt-in status and information. + + Shows: + - Current telemetry enabled/disabled status + - Reason for current status (DO_NOT_TRACK, explicit opt-in/out, or pending prompt) + - What data is collected (anonymized usage stats) + - How to opt-in or opt-out + - Telemetry ID (if enabled) + - Endpoint URL (if configured) + + Use this resource to: + - Check if user needs to be prompted for telemetry consent + - Explain telemetry to users + - Show current telemetry configuration + """ + status = get_telemetry_status() + + lines = [] + lines.append("VoiceMode Telemetry Status") + lines.append("=" * 80) + lines.append("") + + # Current status + status_emoji = "✅" if status["enabled"] else "❌" + lines.append(f"Status: {status_emoji} {'ENABLED' if status['enabled'] else 'DISABLED'}") + lines.append(f"Reason: {status['reason']}") + lines.append("") + + # If user needs to be prompted + if VOICEMODE_TELEMETRY == "ask": + lines.append("⚠️ USER CONSENT REQUIRED") + lines.append("") + lines.append("The user has not yet been asked about telemetry collection.") + lines.append("Please present the opt-in information below and use the") + lines.append("telemetry_set_preference tool to record their choice.") + lines.append("") + + # What is collected + lines.append("📊 What Data is Collected") + lines.append("-" * 80) + lines.append("") + lines.append("VoiceMode collects anonymous usage statistics to help improve the tool:") + lines.append("") + lines.append(" • Session counts and durations (binned: <1min, 1-5min, 5-10min, etc.)") + lines.append(" • Number of voice exchanges per session (binned: 0, 1-5, 6-10, etc.)") + lines.append(" • TTS/STT provider usage (openai, kokoro, whisper-local, other)") + lines.append(" • Transport type (local microphone vs LiveKit)") + lines.append(" • Success/failure rates") + lines.append(" • Error types (anonymized, no personal info)") + lines.append(" • Operating system type") + lines.append(" • Installation method (dev, uv, pip)") + lines.append(" • Execution source (MCP server, CLI)") + lines.append("") + lines.append("Privacy protections:") + lines.append(" • All data is anonymized using a random UUID") + lines.append(" • No personal information, file paths, or API keys are collected") + lines.append(" • Durations and counts are binned to prevent identification") + lines.append(" • Error messages are sanitized to remove specific details") + lines.append(" • Data is opt-in only - disabled by default") + lines.append("") + + # How to opt-in/out + lines.append("🔧 How to Control Telemetry") + lines.append("-" * 80) + lines.append("") + lines.append("Via MCP tool (for LLM use):") + lines.append(" • Use telemetry_set_preference(enabled=true) to opt-in") + lines.append(" • Use telemetry_set_preference(enabled=false) to opt-out") + lines.append("") + lines.append("Via environment variable:") + lines.append(" • Set VOICEMODE_TELEMETRY=true to enable") + lines.append(" • Set VOICEMODE_TELEMETRY=false to disable") + lines.append(" • Set DO_NOT_TRACK=1 to disable all telemetry (universal opt-out)") + lines.append("") + lines.append("Via configuration file (~/.voicemode/voicemode.env):") + lines.append(" • Add VOICEMODE_TELEMETRY=true to enable") + lines.append(" • Add VOICEMODE_TELEMETRY=false to disable") + lines.append("") + + # Current configuration + lines.append("⚙️ Current Configuration") + lines.append("-" * 80) + lines.append("") + if status["do_not_track"]: + lines.append(f" DO_NOT_TRACK: Set (telemetry disabled by universal opt-out)") + else: + lines.append(f" DO_NOT_TRACK: Not set") + lines.append(f" VOICEMODE_TELEMETRY: {status['voicemode_telemetry']}") + + if status["enabled"]: + lines.append(f" Telemetry ID: {status['telemetry_id']}") + else: + lines.append(f" Telemetry ID: (not shown - telemetry disabled)") + + if status["endpoint"]: + lines.append(f" Endpoint: {status['endpoint']}") + else: + lines.append(f" Endpoint: Not configured (telemetry data will be queued locally)") + lines.append("") + + # More information + lines.append("💡 More Information") + lines.append("-" * 80) + lines.append("") + lines.append("For more details about privacy and data collection:") + lines.append(" • See the Privacy section in the VoiceMode documentation") + lines.append(" • Review the source code at voice_mode/telemetry/") + lines.append(" • Contact the maintainers with questions or concerns") + + return "\n".join(lines) + + +@mcp.resource("voicemode://telemetry/opt-in-prompt") +async def telemetry_opt_in_prompt() -> str: + """ + User-friendly telemetry opt-in prompt text. + + This resource provides a concise, friendly prompt that LLMs can use to ask + users about telemetry collection. It's designed to be: + - Clear about what is collected + - Honest about privacy protections + - Easy to understand + - Not pushy or manipulative + + Use this when VOICEMODE_TELEMETRY=ask to present the opt-in choice to users. + """ + lines = [] + lines.append("VoiceMode Telemetry") + lines.append("=" * 60) + lines.append("") + lines.append("VoiceMode would like to collect anonymous usage statistics") + lines.append("to help improve the tool.") + lines.append("") + lines.append("What we collect:") + lines.append(" • Session counts and durations (binned for privacy)") + lines.append(" • Voice exchanges per session") + lines.append(" • TTS/STT provider usage (openai, kokoro, whisper)") + lines.append(" • Success/failure rates") + lines.append(" • Anonymized error types") + lines.append("") + lines.append("What we DON'T collect:") + lines.append(" • Your conversations or voice recordings") + lines.append(" • Personal information or file paths") + lines.append(" • API keys or credentials") + lines.append(" • Anything that could identify you") + lines.append("") + lines.append("Privacy protections:") + lines.append(" • All data is anonymized with a random UUID") + lines.append(" • Numbers are binned to prevent identification") + lines.append(" • You can opt-out anytime") + lines.append(" • Set DO_NOT_TRACK=1 to disable universally") + lines.append("") + lines.append("Would you like to enable telemetry?") + lines.append("") + lines.append(" [Yes] - Help improve VoiceMode with anonymous stats") + lines.append(" [No] - Don't collect any usage data") + lines.append("") + lines.append("(You can change this later in ~/.voicemode/voicemode.env)") + + return "\n".join(lines) diff --git a/voice_mode/server.py b/voice_mode/server.py index 9c359f75..13ba146e 100644 --- a/voice_mode/server.py +++ b/voice_mode/server.py @@ -78,7 +78,15 @@ def main(): logger.info(f"Event logging enabled, writing to {EVENT_LOG_DIR}") else: logger.info("Event logging disabled") - + + # Telemetry: Send usage data if enabled (runs in background, non-blocking) + try: + from .telemetry import maybe_send_telemetry_background + maybe_send_telemetry_background() + except Exception as e: + # Telemetry should never block server startup + logger.debug(f"Telemetry initialization skipped: {e}") + # Run the server mcp.run(transport="stdio") diff --git a/voice_mode/telemetry/README.md b/voice_mode/telemetry/README.md new file mode 100644 index 00000000..dd2132a8 --- /dev/null +++ b/voice_mode/telemetry/README.md @@ -0,0 +1,298 @@ +# VoiceMode Telemetry Module + +Anonymous, opt-in telemetry system for understanding VoiceMode usage patterns while respecting user privacy. + +## Overview + +The telemetry module collects privacy-respecting analytics from VoiceMode usage to help improve the product. All data collection is: + +- **Anonymous**: Uses random UUID with no connection to user identity +- **Opt-in**: Disabled by default, requires explicit user consent +- **Privacy-preserving**: Data is binned and anonymized before transmission +- **Transparent**: Clear documentation of what is and isn't collected + +## Module Structure + +``` +voice_mode/telemetry/ +├── __init__.py # Public API exports +├── collector.py # Data collection from logs +├── privacy.py # Anonymization and binning functions +└── client.py # HTTP transmission client +``` + +## Components + +### TelemetryCollector + +Analyzes VoiceMode event logs and conversation logs to extract usage metrics. + +**Key Methods:** +- `collect_session_data(start_date, end_date)` - Aggregate session statistics +- `collect_environment_data()` - System and installation information +- `collect_telemetry_event()` - Complete telemetry payload + +**Data Collected:** +- Session counts and duration bins (never exact durations) +- Exchanges per session (binned for privacy) +- TTS/STT provider usage (openai, kokoro, whisper-local, other) +- Transport type (local, livekit) +- Success/failure rates +- Anonymized error types (no stack traces or user data) + +**Data NOT Collected:** +- No user names, emails, or personal information +- No file paths (anonymized to ~/Code level) +- No conversation content +- No exact timestamps (binned to daily or hourly) +- No IP addresses beyond what HTTP protocol requires + +### Privacy Functions + +**Duration Binning:** +```python +from voice_mode.telemetry import bin_duration + +duration = bin_duration(180) # "1-5min" +``` + +Bins: +- `<1min` - Under 1 minute +- `1-5min` - 1 to 5 minutes +- `5-10min` - 5 to 10 minutes +- `10-20min` - 10 to 20 minutes +- `20-60min` - 20 to 60 minutes +- `>60min` - Over 60 minutes + +**Size Binning:** +```python +from voice_mode.telemetry import bin_size + +size = bin_size(75 * 1024) # "50-100KB" +``` + +Bins: +- `<50KB` - Under 50 KB +- `50-100KB` - 50 to 100 KB +- `100-200KB` - 100 to 200 KB +- `200-500KB` - 200 to 500 KB +- `>500KB` - Over 500 KB + +**Path Anonymization:** +```python +from voice_mode.telemetry import anonymize_path + +path = anonymize_path("/home/user/Code/project/file.py") # "~/Code" +``` + +Removes user-specific information from paths while preserving general structure. + +**Version Sanitization:** +```python +from voice_mode.telemetry.privacy import sanitize_version_string + +version = sanitize_version_string("2.17.2+local.dev.abc123") # "2.17.2" +``` + +Removes build hashes and local identifiers from version strings. + +### TelemetryClient + +HTTP client for transmitting telemetry events to the backend. + +**Features:** +- Deterministic event ID generation (prevents duplicates) +- Retry logic with exponential backoff +- Offline queueing in `~/.voicemode/telemetry_queue/` +- Rate limit handling (429 responses) +- Automatic cleanup of old queued events + +**Usage:** +```python +from voice_mode.telemetry import TelemetryClient, TelemetryCollector + +# Create client (endpoint URL from config) +client = TelemetryClient(endpoint_url="https://telemetry.example.com/v1/events") + +# Collect and send event +collector = TelemetryCollector() +event = collector.collect_telemetry_event() +success = client.send_event(event) + +# Send queued events (from previous offline periods) +sent_count = client.send_queued_events() + +# Clean up old events (default: 7 days) +cleared = client.clear_old_queued_events(max_age_days=7) +``` + +## Privacy Guarantees + +### What We Collect + +1. **Anonymous Installation ID** + - Random UUID generated on first run + - No connection to user identity + - Stored in `~/.voicemode/telemetry_id` + +2. **Environment Information** + - OS type (Linux, Darwin, Windows) + - Installation method (dev, uv, pip) + - MCP host (claude-code, cursor, cline, etc.) + - Execution source (mcp, cli) + - VoiceMode version (sanitized) + +3. **Usage Metrics (Binned)** + - Session counts and duration bins + - Exchange counts per session (binned) + - Provider usage frequencies + - Transport type usage + - Success/failure rates + - Error type frequencies (anonymized) + +### What We DON'T Collect + +- ❌ User names, emails, or personal information +- ❌ File paths (anonymized to `~/Code` level) +- ❌ Conversation content or transcriptions +- ❌ Exact timestamps (binned to daily) +- ❌ IP addresses (beyond HTTP requirements) +- ❌ Device identifiers or hardware info +- ❌ API keys or credentials +- ❌ Project names or directory structures + +### Data Retention + +- Events queued locally are kept for 7 days maximum +- Backend retention policy: 90 days (configurable) +- No permanent storage of raw events + +## Example Telemetry Event + +```json +{ + "event_id": "3d3d1ffcab7048af", + "telemetry_id": "e85850d8-3ca6-4a78-952a-d0f195738b0a", + "timestamp": "2025-12-14T03:00:00+00:00", + "environment": { + "os_type": "Linux", + "install_method": "dev", + "mcp_host": "claude-code", + "exec_source": "cli", + "version": "2.17.2" + }, + "usage": { + "total_sessions": 42, + "duration_distribution": { + "<1min": 10, + "1-5min": 20, + "5-10min": 8, + "10-20min": 3, + "20-60min": 1 + }, + "exchanges_per_session": { + "0": 2, + "1-5": 30, + "6-10": 8, + "11-20": 2 + }, + "transport_usage": { + "local": 40, + "livekit": 2 + }, + "tts_provider_usage": { + "kokoro": 35, + "openai": 7 + }, + "stt_provider_usage": { + "whisper-local": 38, + "openai": 4 + }, + "success_rate": 95.2, + "total_operations": 42, + "error_types": { + "ConnectionError": 2 + } + } +} +``` + +## Integration with Config + +The telemetry module integrates with VoiceMode configuration: + +```python +from voice_mode import config + +# Telemetry ID (generated on first run) +telemetry_id = config.TELEMETRY_ID + +# Environment detection +env_info = config.get_environment_info() +# Returns: {os_type, install_method, mcp_host, exec_source} +``` + +## Testing + +Run the test script to verify telemetry functionality: + +```bash +cd /path/to/voicemode +python3 test_telemetry.py +``` + +The test script demonstrates: +- Privacy function operation (binning, anonymization) +- Data collection from existing logs +- Telemetry event generation +- Client queue and transmission functionality + +## Future Enhancements + +Planned for upcoming features (tel-006 through tel-010): + +1. **Configuration** (tel-006) + - `VOICEMODE_TELEMETRY` setting (ask/true/false) + - `DO_NOT_TRACK` environment variable support + - Endpoint URL configuration + +2. **Opt-in UX** (tel-007) + - CLI prompts for opt-in + - MCP resources for LLM-assisted consent + - Tools for preference management + +3. **Backend** (tel-008) + - Cloudflare Workers endpoint + - Rate limiting per anonymous ID + - Event validation and storage + +4. **Testing** (tel-009) + - Dogfooding with real usage + - Privacy audit + - Load testing + +5. **Documentation** (tel-010) + - Privacy policy + - Opt-out instructions + - Transparency report + +## Compliance + +The telemetry system is designed to comply with: + +- **GDPR**: Anonymous data, opt-in consent, right to opt-out +- **CCPA**: No sale of personal information (we don't collect any) +- **DO_NOT_TRACK**: Respects DNT header and environment variable + +## Questions? + +For questions about telemetry, privacy, or data collection: + +1. Review this documentation +2. Check the main README.md privacy section (tel-010) +3. Open an issue on GitHub +4. Opt out if uncertain: `voicemode telemetry disable` (tel-007) + +## License + +Same as VoiceMode project (MIT). diff --git a/voice_mode/telemetry/__init__.py b/voice_mode/telemetry/__init__.py new file mode 100644 index 00000000..328f06ee --- /dev/null +++ b/voice_mode/telemetry/__init__.py @@ -0,0 +1,44 @@ +""" +VoiceMode Telemetry Module + +Anonymous, opt-in telemetry system for understanding VoiceMode usage patterns. + +This module collects privacy-respecting analytics from VoiceMode usage including: +- Session counts and durations (binned for privacy) +- Exchange counts per session +- TTS/STT provider usage +- Success/failure rates +- Transport type (local/livekit) + +All data is anonymized, binned to prevent identification, and only sent with +explicit user opt-in. The telemetry ID is a random UUID with no connection to +user identity. +""" + +from voice_mode.telemetry.collector import TelemetryCollector +from voice_mode.telemetry.privacy import ( + bin_duration, + bin_size, + anonymize_path, + DurationBin, + SizeBin +) +from voice_mode.telemetry.client import TelemetryClient +from voice_mode.telemetry.sender import ( + maybe_send_telemetry_background, + maybe_send_telemetry_async, + should_send_telemetry, +) + +__all__ = [ + 'TelemetryCollector', + 'TelemetryClient', + 'bin_duration', + 'bin_size', + 'anonymize_path', + 'DurationBin', + 'SizeBin', + 'maybe_send_telemetry_background', + 'maybe_send_telemetry_async', + 'should_send_telemetry', +] diff --git a/voice_mode/telemetry/client.py b/voice_mode/telemetry/client.py new file mode 100644 index 00000000..ca1b8266 --- /dev/null +++ b/voice_mode/telemetry/client.py @@ -0,0 +1,271 @@ +""" +Telemetry HTTP client. + +Handles transmission of telemetry events to the backend endpoint with +retry logic, rate limiting, and offline queueing support. +""" + +import json +import logging +import hashlib +from datetime import datetime, timezone +from pathlib import Path +from typing import Dict, Any, Optional +from urllib.parse import urljoin + +import httpx + +from voice_mode import config + +logger = logging.getLogger(__name__) + + +class TelemetryClient: + """ + HTTP client for sending telemetry events to backend. + + Features: + - Configurable endpoint URL + - Event deduplication via event IDs + - Retry logic with exponential backoff + - Offline queueing for later transmission + - Request timeout and error handling + """ + + def __init__( + self, + endpoint_url: Optional[str] = None, + timeout: float = 10.0, + max_retries: int = 3, + ): + """ + Initialize the telemetry client. + + Args: + endpoint_url: Backend endpoint URL (will be configurable via env var) + timeout: Request timeout in seconds + max_retries: Maximum number of retry attempts + """ + # Endpoint URL will come from config in tel-006 + # For now, accept as parameter but don't hardcode + self.endpoint_url = endpoint_url + self.timeout = timeout + self.max_retries = max_retries + self.queue_dir = config.BASE_DIR / "telemetry_queue" + + # Ensure queue directory exists + self.queue_dir.mkdir(parents=True, exist_ok=True) + + def _generate_event_id(self, event_data: Dict[str, Any]) -> str: + """ + Generate a unique, deterministic event ID. + + Uses a hash of telemetry_id + timestamp to create an idempotent + event identifier that prevents duplicate submissions. + + Args: + event_data: Event data dictionary + + Returns: + Hexadecimal event ID string + """ + # Create stable hash from telemetry_id and timestamp + id_string = f"{event_data.get('telemetry_id', '')}:{event_data.get('timestamp', '')}" + event_hash = hashlib.sha256(id_string.encode()).hexdigest() + return event_hash[:16] # Use first 16 chars for brevity + + def send_event(self, event_data: Dict[str, Any]) -> bool: + """ + Send a telemetry event to the backend. + + Args: + event_data: Event data dictionary from TelemetryCollector + + Returns: + True if event was sent successfully, False otherwise + """ + if not self.endpoint_url: + logger.debug("No telemetry endpoint configured, skipping send") + return False + + # Generate event ID for idempotency + event_id = self._generate_event_id(event_data) + + # Add event ID to payload + payload = { + "event_id": event_id, + **event_data + } + + # Attempt to send with retries + for attempt in range(self.max_retries): + try: + response = httpx.post( + self.endpoint_url, + json=payload, + timeout=self.timeout, + headers={ + "Content-Type": "application/json", + "User-Agent": f"VoiceMode/{event_data.get('environment', {}).get('version', 'unknown')}", + } + ) + + if response.status_code == 200: + logger.debug(f"Telemetry event sent successfully: {event_id}") + return True + elif response.status_code == 429: + logger.warning("Telemetry rate limit exceeded, will retry later") + self._queue_event(payload) + return False + else: + logger.warning( + f"Telemetry send failed with status {response.status_code}: {response.text}" + ) + + except httpx.TimeoutException: + logger.warning(f"Telemetry send timeout (attempt {attempt + 1}/{self.max_retries})") + except httpx.ConnectError: + logger.debug("Telemetry endpoint not reachable (offline?)") + break # Don't retry connection errors + except Exception as e: + logger.error(f"Unexpected error sending telemetry: {e}") + break + + # If all retries failed, queue for later + self._queue_event(payload) + return False + + def _queue_event(self, event_data: Dict[str, Any]) -> None: + """ + Queue an event for later transmission. + + Stores event in local queue directory for retry when connection + is restored. + + Args: + event_data: Event data to queue + """ + try: + event_id = event_data.get("event_id", "unknown") + queue_file = self.queue_dir / f"event_{event_id}.json" + + with open(queue_file, 'w') as f: + json.dump(event_data, f, indent=2) + + logger.debug(f"Event queued for later transmission: {event_id}") + + except Exception as e: + logger.error(f"Failed to queue telemetry event: {e}") + + def send_queued_events(self) -> int: + """ + Send all queued events. + + Attempts to transmit all events that were previously queued due + to connection failures or rate limiting. + + Returns: + Number of events successfully sent + """ + if not self.endpoint_url: + logger.debug("No telemetry endpoint configured, skipping queued events") + return 0 + + sent_count = 0 + queued_files = list(self.queue_dir.glob("event_*.json")) + + for queue_file in queued_files: + try: + with open(queue_file, 'r') as f: + event_data = json.load(f) + + if self._send_queued_event(event_data): + # Remove from queue on success + queue_file.unlink() + sent_count += 1 + else: + # Keep in queue for later retry + logger.debug(f"Keeping {queue_file.name} in queue") + + except Exception as e: + logger.error(f"Error processing queued event {queue_file}: {e}") + continue + + if sent_count > 0: + logger.info(f"Sent {sent_count} queued telemetry events") + + return sent_count + + def _send_queued_event(self, event_data: Dict[str, Any]) -> bool: + """ + Send a single queued event. + + Args: + event_data: Event data from queue + + Returns: + True if sent successfully, False otherwise + """ + try: + response = httpx.post( + self.endpoint_url, + json=event_data, + timeout=self.timeout, + headers={ + "Content-Type": "application/json", + "User-Agent": f"VoiceMode/{event_data.get('environment', {}).get('version', 'unknown')}", + } + ) + + if response.status_code == 200: + return True + elif response.status_code == 429: + logger.debug("Rate limit still in effect, keeping event queued") + return False + else: + logger.warning( + f"Queued event send failed with status {response.status_code}" + ) + return False + + except Exception as e: + logger.debug(f"Failed to send queued event: {e}") + return False + + def clear_old_queued_events(self, max_age_days: int = 7) -> int: + """ + Clear old queued events to prevent unbounded queue growth. + + Args: + max_age_days: Maximum age in days before events are discarded + + Returns: + Number of events cleared + """ + cleared_count = 0 + cutoff_time = datetime.now(timezone.utc).timestamp() - (max_age_days * 86400) + + for queue_file in self.queue_dir.glob("event_*.json"): + try: + # Check file modification time + if queue_file.stat().st_mtime < cutoff_time: + queue_file.unlink() + cleared_count += 1 + + except Exception as e: + logger.error(f"Error clearing old event {queue_file}: {e}") + continue + + if cleared_count > 0: + logger.info(f"Cleared {cleared_count} old queued telemetry events") + + return cleared_count + + def get_queue_size(self) -> int: + """ + Get the number of events in the queue. + + Returns: + Number of queued events + """ + return len(list(self.queue_dir.glob("event_*.json"))) diff --git a/voice_mode/telemetry/collector.py b/voice_mode/telemetry/collector.py new file mode 100644 index 00000000..c47d34c8 --- /dev/null +++ b/voice_mode/telemetry/collector.py @@ -0,0 +1,391 @@ +""" +Telemetry data collector. + +Gathers telemetry data from existing VoiceMode logs including events and +conversations, applying privacy protections and aggregating into useful metrics. +""" + +import json +import logging +from datetime import datetime, timezone +from pathlib import Path +from typing import Dict, List, Optional, Any +from collections import defaultdict + +from voice_mode.telemetry.privacy import ( + bin_duration, + bin_size, + anonymize_path, + anonymize_error_message, + sanitize_version_string, +) +from voice_mode import config + +logger = logging.getLogger(__name__) + + +class TelemetryCollector: + """ + Collects telemetry data from VoiceMode logs. + + Analyzes event logs and conversation logs to extract privacy-preserving + usage metrics including session statistics, provider usage, and error rates. + """ + + def __init__(self, logs_dir: Optional[Path] = None): + """ + Initialize the telemetry collector. + + Args: + logs_dir: Base directory for logs (defaults to config.LOGS_DIR) + """ + self.logs_dir = logs_dir or config.LOGS_DIR + self.events_dir = Path(self.logs_dir) / "events" + self.conversations_dir = Path(self.logs_dir) / "conversations" + + def collect_session_data( + self, + start_date: Optional[datetime] = None, + end_date: Optional[datetime] = None + ) -> Dict[str, Any]: + """ + Collect aggregated session data from conversation logs. + + Uses conversation logs (exchanges_*.jsonl) which have conversation_id + to accurately track multi-exchange conversations rather than individual + tool calls. + + Args: + start_date: Start date for data collection (inclusive) + end_date: End date for data collection (inclusive) + + Returns: + Dictionary with session statistics including: + - total_conversations: Number of conversations (multi-exchange sessions) + - total_exchanges: Total TTS/STT exchanges across all conversations + - duration_distribution: Binned conversation durations + - exchanges_per_conversation: Distribution of exchanges per conversation + - transport_usage: Counts by transport type (local/livekit) + - provider_usage: TTS and STT provider usage counts + """ + if not self.conversations_dir.exists(): + logger.warning(f"Conversations directory does not exist: {self.conversations_dir}") + return {} + + conversations: Dict[str, Dict] = {} + + # Process conversation log files + for log_file in sorted(self.conversations_dir.glob("exchanges_*.jsonl")): + # Check date filter if provided + if start_date or end_date: + file_date = self._extract_exchange_date_from_filename(log_file.name) + if file_date: + if start_date and file_date < start_date.date(): + continue + if end_date and file_date > end_date.date(): + continue + + try: + with open(log_file, 'r') as f: + for line in f: + if not line.strip(): + continue + + try: + exchange = json.loads(line) + self._process_exchange(exchange, conversations) + except json.JSONDecodeError: + logger.debug(f"Skipping invalid JSON line in {log_file}") + continue + except Exception as e: + logger.error(f"Error processing {log_file}: {e}") + continue + + # Aggregate statistics from conversations + return self._aggregate_conversation_stats(conversations) + + def _process_exchange(self, exchange: Dict, conversations: Dict[str, Dict]) -> None: + """ + Process a single exchange and update conversation tracking. + + Args: + exchange: Exchange dictionary from JSONL conversation log + conversations: Conversations dictionary to update + """ + conv_id = exchange.get("conversation_id") + timestamp_str = exchange.get("timestamp") + exchange_type = exchange.get("type") # "tts" or "stt" + metadata = exchange.get("metadata", {}) + + if not conv_id or not timestamp_str: + return + + # Initialize conversation if new + if conv_id not in conversations: + conversations[conv_id] = { + "start_time": None, + "end_time": None, + "tts_count": 0, + "stt_count": 0, + "tts_providers": set(), + "stt_providers": set(), + "transport": None, + } + + conv = conversations[conv_id] + + # Parse timestamp + try: + timestamp = datetime.fromisoformat(timestamp_str.replace('Z', '+00:00')) + except ValueError: + return + + # Update conversation timing + if not conv["start_time"]: + conv["start_time"] = timestamp + conv["end_time"] = timestamp + + # Count exchanges by type + if exchange_type == "tts": + conv["tts_count"] += 1 + # Track TTS provider + provider = metadata.get("provider") + if provider: + normalized = self._normalize_provider_name(provider) + conv["tts_providers"].add(normalized) + elif exchange_type == "stt": + conv["stt_count"] += 1 + # Track STT provider - check provider_url first for local providers + provider_url = metadata.get("provider_url", "") + provider = metadata.get("provider", "") + # Use the more specific provider name if available + if provider_url: + normalized = self._normalize_provider_name(provider_url) + elif provider: + normalized = self._normalize_provider_name(provider) + else: + normalized = "unknown" + conv["stt_providers"].add(normalized) + + # Track transport + transport = metadata.get("transport") + if transport: + conv["transport"] = transport + + def _normalize_provider_name(self, provider: str) -> str: + """ + Normalize provider name to a consistent format. + + Args: + provider: Provider name or URL string + + Returns: + Normalized provider name (e.g., "openai", "kokoro", "whisper-local") + """ + if not provider: + return "unknown" + + provider_lower = provider.lower() + + # OpenAI + if "openai.com" in provider_lower or provider_lower == "openai": + return "openai" + + # Kokoro TTS + if "8880" in provider_lower or provider_lower == "kokoro": + return "kokoro" + + # Local Whisper STT + if "2022" in provider_lower or provider_lower in ("whisper-local", "whisper"): + return "whisper-local" + + # OpenAI Whisper (cloud) + if "openai-whisper" in provider_lower: + return "openai-whisper" + + # No-op (for testing) + if provider_lower == "no-op": + return "no-op" + + # Other known names - return as-is if simple + if provider_lower.replace("-", "").replace("_", "").isalnum(): + return provider_lower + + # For URLs or complex strings, anonymize + return "other" + + def _extract_provider_name(self, provider_url: str) -> Optional[str]: + """ + Extract provider name from URL (legacy method, use _normalize_provider_name). + + Args: + provider_url: Provider URL string + + Returns: + Provider name (e.g., "openai", "kokoro", "whisper-local") + """ + return self._normalize_provider_name(provider_url) if provider_url else None + + def _extract_date_from_filename(self, filename: str) -> Optional[Any]: + """ + Extract date from log filename. + + Args: + filename: Log filename (e.g., "voicemode_events_2025-07-29.jsonl") + + Returns: + Date object or None if parsing fails + """ + try: + # Extract YYYY-MM-DD from filename + parts = filename.replace("voicemode_events_", "").replace(".jsonl", "") + return datetime.strptime(parts, "%Y-%m-%d").date() + except (ValueError, AttributeError): + return None + + def _extract_exchange_date_from_filename(self, filename: str) -> Optional[Any]: + """ + Extract date from exchange log filename. + + Args: + filename: Log filename (e.g., "exchanges_2025-07-29.jsonl") + + Returns: + Date object or None if parsing fails + """ + try: + # Extract YYYY-MM-DD from filename + parts = filename.replace("exchanges_", "").replace(".jsonl", "") + return datetime.strptime(parts, "%Y-%m-%d").date() + except (ValueError, AttributeError): + return None + + def _aggregate_conversation_stats(self, conversations: Dict[str, Dict]) -> Dict[str, Any]: + """ + Aggregate statistics from all conversations. + + Args: + conversations: Dictionary of conversation data + + Returns: + Aggregated statistics dictionary + """ + duration_bins = defaultdict(int) + exchange_bins = defaultdict(int) + transport_counts = defaultdict(int) + tts_provider_counts = defaultdict(int) + stt_provider_counts = defaultdict(int) + + total_conversations = len(conversations) + total_exchanges = 0 + + for conv in conversations.values(): + # Calculate conversation duration + if conv["start_time"] and conv["end_time"]: + duration_seconds = (conv["end_time"] - conv["start_time"]).total_seconds() + duration_bin = bin_duration(duration_seconds) + duration_bins[duration_bin] += 1 + + # Count total exchanges (TTS + STT) + exchanges = conv["tts_count"] + conv["stt_count"] + total_exchanges += exchanges + + # Bin exchange counts (privacy-preserving) + if exchanges == 0: + exchange_bin = "0" + elif exchanges <= 5: + exchange_bin = "1-5" + elif exchanges <= 10: + exchange_bin = "6-10" + elif exchanges <= 20: + exchange_bin = "11-20" + else: + exchange_bin = ">20" + exchange_bins[exchange_bin] += 1 + + # Transport usage + if conv["transport"]: + transport_counts[conv["transport"]] += 1 + + # Provider usage (from conversation metadata) + for provider in conv["tts_providers"]: + tts_provider_counts[provider] += 1 + for provider in conv["stt_providers"]: + stt_provider_counts[provider] += 1 + + return { + "total_sessions": total_conversations, + "total_exchanges": total_exchanges, + "duration_distribution": dict(duration_bins), + "exchanges_per_session": dict(exchange_bins), + "transport_usage": dict(transport_counts), + "provider_usage": { + "tts": dict(tts_provider_counts), + "stt": dict(stt_provider_counts), + }, + } + + def collect_environment_data(self) -> Dict[str, Any]: + """ + Collect environment and configuration data. + + Returns: + Dictionary with environment information: + - os: Operating system (matches worker schema) + - install_method: Installation method (dev/uv/pip) + - mcp_host: MCP host application (if applicable) + - exec_source: Execution source (mcp/cli) + - version: VoiceMode version (sanitized) + """ + from voice_mode import __version__ + + env_info = config.get_environment_info() + + return { + "os": env_info.get("os_type"), # Worker expects "os" not "os_type" + "install_method": env_info.get("install_method"), + "mcp_host": env_info.get("mcp_host"), + "exec_source": env_info.get("exec_source"), + "version": sanitize_version_string(__version__), + } + + def collect_telemetry_event( + self, + start_date: Optional[datetime] = None, + end_date: Optional[datetime] = None, + ) -> Dict[str, Any]: + """ + Collect a complete telemetry event payload. + + Combines session data and environment data into a single event + suitable for transmission to telemetry backend. + + Args: + start_date: Start of collection period (defaults to start of today) + end_date: End of collection period (defaults to now) + + Returns: + Complete telemetry event dictionary with period_start and period_end + """ + # Get environment data + env_data = self.collect_environment_data() + + # Set defaults for date range + if end_date is None: + end_date = datetime.now(timezone.utc) + if start_date is None: + start_date = end_date.replace(hour=0, minute=0, second=0, microsecond=0) + + session_data = self.collect_session_data(start_date, end_date) + + # Get telemetry ID from config + telemetry_id = config.TELEMETRY_ID + + return { + "telemetry_id": telemetry_id, + "timestamp": datetime.now(timezone.utc).isoformat(), + "period_start": start_date.isoformat(), + "period_end": end_date.isoformat(), + "environment": env_data, + "usage": session_data, + } diff --git a/voice_mode/telemetry/privacy.py b/voice_mode/telemetry/privacy.py new file mode 100644 index 00000000..8946f616 --- /dev/null +++ b/voice_mode/telemetry/privacy.py @@ -0,0 +1,207 @@ +""" +Privacy utilities for telemetry data anonymization and binning. + +This module provides functions to anonymize and bin telemetry data to protect +user privacy while still providing useful aggregate analytics. +""" + +from enum import Enum +from pathlib import Path +from typing import Optional + + +class DurationBin(str, Enum): + """Duration bins for privacy-preserving time tracking.""" + UNDER_1_MIN = "<1min" + MIN_1_TO_5 = "1-5min" + MIN_5_TO_10 = "5-10min" + MIN_10_TO_20 = "10-20min" + MIN_20_TO_60 = "20-60min" + OVER_60_MIN = ">60min" + + +class SizeBin(str, Enum): + """Size bins for privacy-preserving size tracking.""" + UNDER_50KB = "<50KB" + KB_50_TO_100 = "50-100KB" + KB_100_TO_200 = "100-200KB" + KB_200_TO_500 = "200-500KB" + OVER_500KB = ">500KB" + + +def bin_duration(seconds: float) -> str: + """ + Bin a duration in seconds into privacy-preserving categories. + + Args: + seconds: Duration in seconds + + Returns: + Duration bin string (e.g., "1-5min") + + Examples: + >>> bin_duration(30) + '<1min' + >>> bin_duration(180) + '1-5min' + >>> bin_duration(7200) + '>60min' + """ + minutes = seconds / 60.0 + + if minutes < 1: + return DurationBin.UNDER_1_MIN.value + elif minutes < 5: + return DurationBin.MIN_1_TO_5.value + elif minutes < 10: + return DurationBin.MIN_5_TO_10.value + elif minutes < 20: + return DurationBin.MIN_10_TO_20.value + elif minutes < 60: + return DurationBin.MIN_20_TO_60.value + else: + return DurationBin.OVER_60_MIN.value + + +def bin_size(size_bytes: int) -> str: + """ + Bin a size in bytes into privacy-preserving categories. + + Args: + size_bytes: Size in bytes + + Returns: + Size bin string (e.g., "50-100KB") + + Examples: + >>> bin_size(1024) + '<50KB' + >>> bin_size(75 * 1024) + '50-100KB' + >>> bin_size(1024 * 1024) + '>500KB' + """ + kb = size_bytes / 1024.0 + + if kb < 50: + return SizeBin.UNDER_50KB.value + elif kb < 100: + return SizeBin.KB_50_TO_100.value + elif kb < 200: + return SizeBin.KB_100_TO_200.value + elif kb < 500: + return SizeBin.KB_200_TO_500.value + else: + return SizeBin.OVER_500KB.value + + +def anonymize_path(path: str) -> str: + """ + Anonymize a file path by removing user-specific information. + + Replaces home directory with ~, removes username, and generalizes + project-specific paths to protect user privacy. + + Args: + path: File path to anonymize + + Returns: + Anonymized path string + + Examples: + >>> anonymize_path("/home/user/Code/project/file.py") + '~/Code/project' + >>> anonymize_path("/Users/username/Documents/work") + '~/Documents' + """ + try: + p = Path(path).expanduser().resolve() + home = Path.home() + + # If path is under home directory, use ~ notation + if p.is_relative_to(home): + relative = p.relative_to(home) + # Only keep up to 2 levels of depth for privacy + parts = relative.parts[:2] if len(relative.parts) >= 2 else relative.parts + return str(Path("~") / Path(*parts)) + + # For paths outside home, only keep first 2 components + parts = p.parts[:2] if len(p.parts) >= 2 else p.parts + return str(Path(*parts)) + + except (ValueError, RuntimeError): + # If path resolution fails, return a generic placeholder + return "~" + + +def anonymize_error_message(error_msg: str) -> Optional[str]: + """ + Anonymize error messages by removing user-specific information. + + Removes file paths, usernames, and other identifying information while + preserving the error type and general context. + + Args: + error_msg: Error message to anonymize + + Returns: + Anonymized error message, or None if message should not be tracked + + Examples: + >>> anonymize_error_message("FileNotFoundError: /home/user/file.txt") + 'FileNotFoundError: ' + >>> anonymize_error_message("Connection refused at 192.168.1.100") + 'Connection refused' + """ + if not error_msg: + return None + + # Preserve error type but anonymize details + # Common error patterns to preserve + error_types = [ + "FileNotFoundError", + "PermissionError", + "ConnectionError", + "TimeoutError", + "HTTPError", + "APIError", + "ValueError", + "TypeError", + ] + + # Extract error type if present + for error_type in error_types: + if error_type in error_msg: + return error_type + + # For other errors, return first word (usually the error type) + first_word = error_msg.split(":")[0].split()[0] + return first_word if first_word else None + + +def sanitize_version_string(version: str) -> str: + """ + Sanitize version string to remove any potentially identifying suffixes. + + Args: + version: Version string (e.g., "2.17.2+local.dev.abc123") + + Returns: + Sanitized version string (e.g., "2.17.2") + + Examples: + >>> sanitize_version_string("2.17.2") + '2.17.2' + >>> sanitize_version_string("2.17.2+local") + '2.17.2' + >>> sanitize_version_string("2.17.2-dev.abc123") + '2.17.2-dev' + """ + # Split on + to remove local version suffixes + base_version = version.split("+")[0] + + # For -dev suffixes, keep the -dev but remove hash + if "-dev." in base_version: + base_version = base_version.split(".")[0] + "-dev" + + return base_version diff --git a/voice_mode/telemetry/sender.py b/voice_mode/telemetry/sender.py new file mode 100644 index 00000000..ce59e521 --- /dev/null +++ b/voice_mode/telemetry/sender.py @@ -0,0 +1,265 @@ +""" +Telemetry auto-send module. + +Handles automatic telemetry sending on MCP server startup with: +- 24-hour cooldown between sends +- Non-blocking background execution +- Local logging for transparency +""" + +import asyncio +import json +import logging +import threading +from datetime import datetime, timezone, timedelta +from pathlib import Path +from typing import Optional + +from voice_mode import config +from voice_mode.telemetry.collector import TelemetryCollector +from voice_mode.telemetry.client import TelemetryClient + +logger = logging.getLogger(__name__) + +# Constants +COOLDOWN_HOURS = 24 +LOGS_DIR = config.BASE_DIR / "logs" / "telemetry" +MAX_LOG_AGE_DAYS = 30 + + +def get_last_send_time() -> Optional[datetime]: + """Get the timestamp of the last successful telemetry send. + + Uses the period_end from the most recent telemetry log file. + This is used for both: + - Cooldown checking (24 hours between sends) + - Determining where to start collecting data from + + Returns: + datetime of last period_end if found, None otherwise + """ + if not LOGS_DIR.exists(): + return None + + # Find the most recent telemetry log file + log_files = sorted(LOGS_DIR.glob("telemetry_*.json"), reverse=True) + if not log_files: + return None + + # Try each file starting from most recent + for log_file in log_files: + try: + with open(log_file, 'r') as f: + data = json.load(f) + + # Handle both list and single object formats + if isinstance(data, list) and data: + # Get the last entry in the list + last_entry = data[-1] + elif isinstance(data, dict): + last_entry = data + else: + continue + + # Extract period_end, falling back to timestamp for older entries + period_end_str = last_entry.get("period_end") or last_entry.get("timestamp") + if period_end_str: + return datetime.fromisoformat(period_end_str) + + except (json.JSONDecodeError, OSError, ValueError) as e: + logger.debug(f"Failed to read period_end from {log_file}: {e}") + continue + + return None + + +def should_send_telemetry() -> bool: + """Determine if telemetry should be sent. + + Returns True if: + - Telemetry is enabled + - Endpoint is configured + - At least COOLDOWN_HOURS have passed since last send (or never sent) + """ + # Check if telemetry is enabled + if not config.is_telemetry_enabled(): + logger.debug("Telemetry not enabled, skipping send") + return False + + # Check if endpoint is configured + if not config.VOICEMODE_TELEMETRY_ENDPOINT: + logger.debug("No telemetry endpoint configured, skipping send") + return False + + # Check cooldown + last_send = get_last_send_time() + if last_send is not None: + now = datetime.now(timezone.utc) + hours_since_last = (now - last_send).total_seconds() / 3600 + if hours_since_last < COOLDOWN_HOURS: + logger.debug( + f"Telemetry sent {hours_since_last:.1f} hours ago, " + f"waiting until {COOLDOWN_HOURS} hours have passed" + ) + return False + + return True + + +def log_telemetry_payload(payload: dict) -> None: + """Save telemetry payload to local log file for transparency. + + Creates a date-stamped JSON file in ~/.voicemode/logs/telemetry/ + so users can audit exactly what data was sent. + + Args: + payload: The telemetry event payload that was sent + """ + LOGS_DIR.mkdir(parents=True, exist_ok=True) + + today = datetime.now().strftime("%Y-%m-%d") + log_file = LOGS_DIR / f"telemetry_{today}.json" + + try: + # Append to existing file or create new one + if log_file.exists(): + # Read existing content + with open(log_file, 'r') as f: + try: + existing = json.load(f) + if not isinstance(existing, list): + existing = [existing] + except json.JSONDecodeError: + existing = [] + existing.append(payload) + else: + existing = [payload] + + # Write with pretty formatting for readability + with open(log_file, 'w') as f: + json.dump(existing, f, indent=2, default=str) + + logger.info(f"Telemetry payload logged to: {log_file}") + + except Exception as e: + logger.warning(f"Failed to log telemetry payload: {e}") + + +def cleanup_old_logs() -> int: + """Remove telemetry log files older than MAX_LOG_AGE_DAYS. + + Returns: + Number of files removed + """ + if not LOGS_DIR.exists(): + return 0 + + removed = 0 + cutoff = datetime.now(timezone.utc).timestamp() - (MAX_LOG_AGE_DAYS * 86400) + + for log_file in LOGS_DIR.glob("telemetry_*.json"): + try: + if log_file.stat().st_mtime < cutoff: + log_file.unlink() + removed += 1 + logger.debug(f"Removed old telemetry log: {log_file.name}") + except Exception as e: + logger.warning(f"Failed to remove old log {log_file}: {e}") + + if removed: + logger.info(f"Cleaned up {removed} old telemetry log files") + + return removed + + +def send_telemetry_sync() -> bool: + """Synchronous telemetry send function. + + Collects telemetry data since the last send and transmits to configured endpoint. + Uses period_end from the last telemetry log as the start date for continuous coverage. + Logs the payload locally for transparency (which also serves as the cooldown tracker). + + Returns: + True if send was successful, False otherwise + """ + try: + # Determine collection period + # Start from last period_end to ensure continuous coverage + start_date = get_last_send_time() + end_date = datetime.now(timezone.utc) + + if start_date is None: + # First send ever - collect last 24 hours + start_date = end_date - timedelta(hours=24) + logger.info("First telemetry send - collecting last 24 hours") + else: + logger.info(f"Collecting telemetry from {start_date} to {end_date}") + + # Collect telemetry event for the period + collector = TelemetryCollector() + event = collector.collect_telemetry_event(start_date=start_date, end_date=end_date) + + if not event: + logger.warning("No telemetry data collected") + return False + + # Log the payload locally before sending (transparency) + # This also serves as the "last send" marker for cooldown checking + log_telemetry_payload(event) + + # Send to endpoint + client = TelemetryClient(endpoint_url=config.VOICEMODE_TELEMETRY_ENDPOINT) + success = client.send_event(event) + + if success: + logger.info("Telemetry sent successfully") + # Clean up old logs while we're at it + cleanup_old_logs() + # Also try to send any queued events + queued = client.send_queued_events() + if queued: + logger.info(f"Also sent {queued} queued telemetry events") + else: + logger.warning("Telemetry send failed (will retry later)") + + return success + + except Exception as e: + logger.error(f"Error sending telemetry: {e}") + return False + + +def maybe_send_telemetry_background() -> None: + """Send telemetry in a background thread if appropriate. + + This is the main entry point called from server startup. + Runs telemetry send in a separate thread to avoid blocking. + """ + if not should_send_telemetry(): + return + + def _send_in_background(): + try: + logger.info("Starting background telemetry send...") + send_telemetry_sync() + except Exception as e: + logger.error(f"Background telemetry send failed: {e}") + + # Run in background thread to avoid blocking server startup + thread = threading.Thread(target=_send_in_background, daemon=True) + thread.start() + logger.debug("Telemetry send initiated in background thread") + + +async def maybe_send_telemetry_async() -> bool: + """Async version of telemetry send for use in async contexts. + + Returns: + True if telemetry was sent successfully, False otherwise + """ + if not should_send_telemetry(): + return False + + # Run the sync function in a thread pool to not block the event loop + loop = asyncio.get_event_loop() + return await loop.run_in_executor(None, send_telemetry_sync) diff --git a/voice_mode/tools/telemetry_management.py b/voice_mode/tools/telemetry_management.py new file mode 100644 index 00000000..ee6743af --- /dev/null +++ b/voice_mode/tools/telemetry_management.py @@ -0,0 +1,276 @@ +"""Telemetry management tools for VoiceMode.""" + +import os +import re +from pathlib import Path +from typing import Dict + +from voice_mode.server import mcp +from voice_mode.config import ( + logger, + BASE_DIR, + get_telemetry_status, + is_telemetry_enabled, +) + + +# Configuration file path (user-level only for security) +USER_CONFIG_PATH = Path.home() / ".voicemode" / "voicemode.env" +# Legacy path for backwards compatibility +LEGACY_CONFIG_PATH = Path.home() / ".voicemode" / ".voicemode.env" + + +def parse_env_file(file_path: Path) -> Dict[str, str]: + """Parse an environment file and return a dictionary of key-value pairs.""" + config = {} + if not file_path.exists(): + return config + + with open(file_path, 'r') as f: + for line in f: + line = line.strip() + # Skip empty lines and comments + if not line or line.startswith('#'): + continue + # Parse KEY=VALUE format + match = re.match(r'^([A-Z_]+)=(.*)$', line) + if match: + key, value = match.groups() + # Remove quotes if present + value = value.strip('"').strip("'") + config[key] = value + + return config + + +def write_env_file(file_path: Path, config: Dict[str, str], preserve_comments: bool = True): + """Write configuration to an environment file. + + Handles three cases: + 1. Active config line (KEY=value) - replace with new value if key in config + 2. Commented config line (# KEY=value) - replace with active value if key in config + 3. Regular comments (# some text) - preserve as-is + """ + # Read existing file to preserve comments and structure + existing_lines = [] + existing_keys = set() + + # Pattern for commented-out config lines: # KEY=value or #KEY=value + commented_config_pattern = re.compile(r'^#\s*([A-Z][A-Z0-9_]*)=') + + if file_path.exists() and preserve_comments: + with open(file_path, 'r') as f: + for line in f: + stripped = line.strip() + if stripped and not stripped.startswith('#'): + # Active config line + match = re.match(r'^([A-Z_]+)=', stripped) + if match: + key = match.group(1) + existing_keys.add(key) + if key in config: + # Replace with new value + existing_lines.append(f"{key}={config[key]}\n") + else: + # Keep existing line + existing_lines.append(line) + else: + existing_lines.append(line) + elif stripped.startswith('#'): + # Check if this is a commented-out config line + commented_match = commented_config_pattern.match(stripped) + if commented_match: + key = commented_match.group(1) + if key in config: + # Replace commented default with active value + existing_lines.append(f"{key}={config[key]}\n") + existing_keys.add(key) + else: + # Keep the commented default as-is + existing_lines.append(line) + else: + # Regular comment - preserve as-is + existing_lines.append(line) + else: + # Empty lines + existing_lines.append(line) + + # Add new keys that weren't in the file + new_keys = set(config.keys()) - existing_keys + if new_keys and existing_lines: + # Add a newline before new entries if file has content + if existing_lines and not existing_lines[-1].strip() == '': + existing_lines.append('\n') + + # Group telemetry keys together + telemetry_keys = sorted([k for k in new_keys if k.startswith('VOICEMODE_TELEMETRY')]) + other_keys = sorted([k for k in new_keys if not k.startswith('VOICEMODE_TELEMETRY')]) + + if telemetry_keys: + existing_lines.append("# Telemetry Configuration\n") + for key in telemetry_keys: + existing_lines.append(f"{key}={config[key]}\n") + existing_lines.append('\n') + + if other_keys: + existing_lines.append("# Additional Configuration\n") + for key in other_keys: + existing_lines.append(f"{key}={config[key]}\n") + + # Write the file + file_path.parent.mkdir(parents=True, exist_ok=True) + with open(file_path, 'w') as f: + f.writelines(existing_lines if existing_lines else [f"{k}={v}\n" for k, v in sorted(config.items())]) + + # Set appropriate permissions (readable/writable by owner only) + os.chmod(file_path, 0o600) + + +@mcp.tool() +async def telemetry_set_preference(enabled: bool) -> str: + """Set user's telemetry preference (opt-in or opt-out). + + This tool records the user's telemetry choice in the configuration file + (~/.voicemode/voicemode.env) and reloads the configuration. + + Args: + enabled: True to enable telemetry (opt-in), False to disable (opt-out) + + Returns: + Confirmation message with the updated telemetry status + + Privacy note: + - This tool only updates the local configuration file + - No data is sent when opting out + - When opting in, only anonymous usage stats are collected + - Users can change their preference at any time + """ + # Use user config path, check for legacy if new doesn't exist + config_path = USER_CONFIG_PATH + if not config_path.exists() and LEGACY_CONFIG_PATH.exists(): + config_path = LEGACY_CONFIG_PATH + logger.warning(f"Using deprecated .voicemode.env - please rename to voicemode.env") + + # Check if DO_NOT_TRACK is set (overrides everything) + if os.getenv("DO_NOT_TRACK") is not None: + return """⚠️ Cannot change telemetry preference + +DO_NOT_TRACK environment variable is set, which overrides all telemetry settings. +Telemetry is disabled and cannot be enabled while DO_NOT_TRACK is set. + +To enable telemetry: +1. Unset the DO_NOT_TRACK environment variable +2. Use this tool again to set your preference + +Note: DO_NOT_TRACK is a universal opt-out standard that disables telemetry +across many tools and services.""" + + try: + # Read existing configuration + config = parse_env_file(config_path) + + # Set VOICEMODE_TELEMETRY + new_value = "true" if enabled else "false" + old_value = config.get("VOICEMODE_TELEMETRY", "ask") + + config["VOICEMODE_TELEMETRY"] = new_value + + # Write back to file + write_env_file(config_path, config) + + # Log the change + logger.info(f"Telemetry preference updated: {old_value} -> {new_value}") + + # Build response message + lines = [] + if enabled: + lines.append("✅ Telemetry enabled - Thank you!") + lines.append("") + lines.append("VoiceMode will now collect anonymous usage statistics to help") + lines.append("improve the tool. This includes:") + lines.append(" • Session counts and durations (binned)") + lines.append(" • Voice exchanges per session") + lines.append(" • TTS/STT provider usage") + lines.append(" • Success/failure rates") + lines.append(" • Anonymized error types") + lines.append("") + lines.append("Remember:") + lines.append(" • No personal information is collected") + lines.append(" • All data is anonymized") + lines.append(" • You can opt-out anytime") + else: + lines.append("✅ Telemetry disabled") + lines.append("") + lines.append("VoiceMode will not collect any usage statistics.") + lines.append("No data will be sent to any servers.") + lines.append("") + lines.append("You can enable telemetry later if you change your mind.") + + lines.append("") + lines.append(f"Configuration saved to: {config_path}") + lines.append(f"Previous setting: VOICEMODE_TELEMETRY={old_value}") + lines.append(f"New setting: VOICEMODE_TELEMETRY={new_value}") + lines.append("") + lines.append("Note: Changes take effect immediately. The MCP server does not need") + lines.append("to be restarted.") + + return "\n".join(lines) + + except Exception as e: + logger.error(f"Failed to set telemetry preference: {e}") + return f"❌ Failed to set telemetry preference: {str(e)}" + + +@mcp.tool() +async def telemetry_check_status() -> str: + """Check current telemetry status and configuration. + + This tool provides a quick summary of the telemetry configuration without + the detailed information shown in the telemetry status resource. + + Returns: + Brief status message showing if telemetry is enabled and why + """ + status = get_telemetry_status() + + lines = [] + lines.append("Telemetry Status") + lines.append("=" * 40) + lines.append("") + + # Status + status_emoji = "✅" if status["enabled"] else "❌" + lines.append(f"Status: {status_emoji} {'ENABLED' if status['enabled'] else 'DISABLED'}") + lines.append(f"Reason: {status['reason']}") + lines.append("") + + # Quick actions + if status["voicemode_telemetry"] == "ask": + lines.append("⚠️ User has not been prompted for telemetry consent") + lines.append("") + lines.append("Next steps:") + lines.append(" 1. Read voicemode://telemetry/opt-in-prompt for prompt text") + lines.append(" 2. Ask the user if they want to enable telemetry") + lines.append(" 3. Use telemetry_set_preference(enabled=true/false) to record choice") + elif status["enabled"]: + lines.append("Telemetry is active and collecting anonymous usage statistics.") + lines.append("") + lines.append("To disable:") + lines.append(" • Use telemetry_set_preference(enabled=false)") + lines.append(" • Or set DO_NOT_TRACK=1 environment variable") + else: + lines.append("Telemetry is disabled. No usage data is being collected.") + lines.append("") + lines.append("To enable:") + if status["do_not_track"]: + lines.append(" • Unset DO_NOT_TRACK environment variable") + lines.append(" • Then use telemetry_set_preference(enabled=true)") + else: + lines.append(" • Use telemetry_set_preference(enabled=true)") + + lines.append("") + lines.append("For more information:") + lines.append(" • Read voicemode://telemetry/status resource") + lines.append(" • See Privacy section in VoiceMode documentation") + + return "\n".join(lines) diff --git a/voice_mode/utils/telemetry_prompt.py b/voice_mode/utils/telemetry_prompt.py new file mode 100644 index 00000000..585da45f --- /dev/null +++ b/voice_mode/utils/telemetry_prompt.py @@ -0,0 +1,286 @@ +"""Utility functions for telemetry opt-in prompts.""" + +import os +import re +import sys +from pathlib import Path +from typing import Dict, Optional + + +USER_CONFIG_PATH = Path.home() / ".voicemode" / "voicemode.env" +LEGACY_CONFIG_PATH = Path.home() / ".voicemode" / ".voicemode.env" + + +def should_prompt_for_telemetry() -> bool: + """Check if we should prompt the user for telemetry consent. + + Returns True if: + - VOICEMODE_TELEMETRY is set to 'ask' (default) + - DO_NOT_TRACK is not set + - We're in an interactive terminal (not MCP/pipe mode) + - User hasn't already been prompted (checked via config file) + + Returns: + True if user should be prompted, False otherwise + """ + # Check if DO_NOT_TRACK is set (overrides everything) + if os.getenv("DO_NOT_TRACK") is not None: + return False + + # Check VOICEMODE_TELEMETRY setting + voicemode_telemetry = os.getenv("VOICEMODE_TELEMETRY", "ask").lower() + if voicemode_telemetry != "ask": + # User has already made a choice (true or false) + return False + + # Check if we're in interactive mode (not a pipe or MCP mode) + if not sys.stdin.isatty() or not sys.stdout.isatty(): + return False + + # Check if telemetry preference is already set in config file + config_path = USER_CONFIG_PATH + if not config_path.exists() and LEGACY_CONFIG_PATH.exists(): + config_path = LEGACY_CONFIG_PATH + + if config_path.exists(): + try: + with open(config_path, 'r') as f: + content = f.read() + # Check if VOICEMODE_TELEMETRY is explicitly set in the file + # (not just commented out) + if re.search(r'^VOICEMODE_TELEMETRY=', content, re.MULTILINE): + # Already configured in file + return False + except Exception: + pass + + # All checks passed - user should be prompted + return True + + +def prompt_for_telemetry_consent() -> Optional[bool]: + """Prompt user for telemetry consent in interactive CLI mode. + + Shows a clear, concise prompt explaining what telemetry collects and + asks the user to opt-in or opt-out. + + Returns: + True if user opts in, False if user opts out, None if prompt fails + """ + if not should_prompt_for_telemetry(): + return None + + try: + # Clear screen and show prompt + print("\n" + "=" * 70) + print("VoiceMode Telemetry") + print("=" * 70) + print() + print("VoiceMode would like to collect anonymous usage statistics") + print("to help improve the tool.") + print() + print("What we collect:") + print(" • Session counts and durations (binned for privacy)") + print(" • Voice exchanges per session") + print(" • TTS/STT provider usage (openai, kokoro, whisper)") + print(" • Success/failure rates") + print(" • Anonymized error types") + print() + print("What we DON'T collect:") + print(" • Your conversations or voice recordings") + print(" • Personal information or file paths") + print(" • API keys or credentials") + print(" • Anything that could identify you") + print() + print("Privacy protections:") + print(" • All data is anonymized with a random UUID") + print(" • Numbers are binned to prevent identification") + print(" • You can opt-out anytime with DO_NOT_TRACK=1") + print() + print("=" * 70) + + # Get user response + while True: + response = input("\nEnable telemetry? [y/N]: ").strip().lower() + + if response in ['y', 'yes']: + return True + elif response in ['n', 'no', '']: + # Default to no if user just presses enter + return False + else: + print("Please enter 'y' for yes or 'n' for no (or press Enter for no)") + + except (EOFError, KeyboardInterrupt): + # User interrupted prompt (Ctrl+C or Ctrl+D) + print("\n\nTelemetry prompt cancelled - defaulting to disabled") + return False + except Exception as e: + # Unexpected error - fail safely by not enabling telemetry + print(f"\nError during telemetry prompt: {e}") + print("Defaulting to telemetry disabled") + return False + + +def parse_env_file(file_path: Path) -> Dict[str, str]: + """Parse an environment file and return a dictionary of key-value pairs.""" + config = {} + if not file_path.exists(): + return config + + with open(file_path, 'r') as f: + for line in f: + line = line.strip() + # Skip empty lines and comments + if not line or line.startswith('#'): + continue + # Parse KEY=VALUE format + match = re.match(r'^([A-Z_]+)=(.*)$', line) + if match: + key, value = match.groups() + # Remove quotes if present + value = value.strip('"').strip("'") + config[key] = value + + return config + + +def write_env_file(file_path: Path, config: Dict[str, str], preserve_comments: bool = True): + """Write configuration to an environment file. + + Handles three cases: + 1. Active config line (KEY=value) - replace with new value if key in config + 2. Commented config line (# KEY=value) - replace with active value if key in config + 3. Regular comments (# some text) - preserve as-is + """ + # Read existing file to preserve comments and structure + existing_lines = [] + existing_keys = set() + + # Pattern for commented-out config lines: # KEY=value or #KEY=value + commented_config_pattern = re.compile(r'^#\s*([A-Z][A-Z0-9_]*)=') + + if file_path.exists() and preserve_comments: + with open(file_path, 'r') as f: + for line in f: + stripped = line.strip() + if stripped and not stripped.startswith('#'): + # Active config line + match = re.match(r'^([A-Z_]+)=', stripped) + if match: + key = match.group(1) + existing_keys.add(key) + if key in config: + # Replace with new value + existing_lines.append(f"{key}={config[key]}\n") + else: + # Keep existing line + existing_lines.append(line) + else: + existing_lines.append(line) + elif stripped.startswith('#'): + # Check if this is a commented-out config line + commented_match = commented_config_pattern.match(stripped) + if commented_match: + key = commented_match.group(1) + if key in config: + # Replace commented default with active value + existing_lines.append(f"{key}={config[key]}\n") + existing_keys.add(key) + else: + # Keep the commented default as-is + existing_lines.append(line) + else: + # Regular comment - preserve as-is + existing_lines.append(line) + else: + # Empty lines + existing_lines.append(line) + + # Add new keys that weren't in the file + new_keys = set(config.keys()) - existing_keys + if new_keys and existing_lines: + # Add a newline before new entries if file has content + if existing_lines and not existing_lines[-1].strip() == '': + existing_lines.append('\n') + + # Add telemetry configuration section + existing_lines.append("#############\n") + existing_lines.append("# Telemetry Configuration\n") + existing_lines.append("#############\n") + existing_lines.append("\n") + for key in sorted(new_keys): + existing_lines.append(f"{key}={config[key]}\n") + + # Write the file + file_path.parent.mkdir(parents=True, exist_ok=True) + with open(file_path, 'w') as f: + f.writelines(existing_lines if existing_lines else [f"{k}={v}\n" for k, v in sorted(config.items())]) + + # Set appropriate permissions (readable/writable by owner only) + os.chmod(file_path, 0o600) + + +def save_telemetry_preference(enabled: bool) -> bool: + """Save user's telemetry preference to configuration file. + + Args: + enabled: True to enable telemetry, False to disable + + Returns: + True if saved successfully, False otherwise + """ + config_path = USER_CONFIG_PATH + if not config_path.exists() and LEGACY_CONFIG_PATH.exists(): + config_path = LEGACY_CONFIG_PATH + + try: + # Read existing configuration + config = parse_env_file(config_path) + + # Set VOICEMODE_TELEMETRY + config["VOICEMODE_TELEMETRY"] = "true" if enabled else "false" + + # Write back to file + write_env_file(config_path, config) + + return True + + except Exception as e: + print(f"Warning: Failed to save telemetry preference: {e}") + return False + + +def maybe_prompt_for_telemetry(): + """Check if telemetry prompt is needed and show it if appropriate. + + This is the main entry point for CLI commands to check and prompt + for telemetry consent. + + If the user needs to be prompted, shows the prompt and saves their + preference to the configuration file. + """ + if not should_prompt_for_telemetry(): + return + + # Prompt user + consent = prompt_for_telemetry_consent() + + if consent is None: + # Prompt failed or was not shown + return + + # Save preference + if save_telemetry_preference(consent): + if consent: + print("\n✅ Telemetry enabled - Thank you!") + else: + print("\n✅ Telemetry disabled") + + print(f"Preference saved to: {USER_CONFIG_PATH}") + print("You can change this anytime by editing the file or setting") + print("VOICEMODE_TELEMETRY=true/false in your environment.\n") + else: + print("\n⚠️ Could not save preference to config file") + print("You can manually set VOICEMODE_TELEMETRY=true/false") + print(f"in {USER_CONFIG_PATH}\n")