---
# Source: ThemisDB/config/graph_protection.yaml (makr-code/ThemisDB, develop branch)
# Graph Protection Configuration Example
#
# This configuration file demonstrates recommended settings for protecting
# knowledge graphs and vector embeddings from unauthorized extraction.
#
# Based on research: "Making Stolen Data Unusable for AI Training" (2026)
# https://www.golem.de/news/schutz-fuer-wissensgraphen-forscher-machen-gestohlene-daten-fuer-ki-unbrauchbar-2601-203870.html

# ============================================================================
# GRAPH PROTECTION SETTINGS
# ============================================================================

graph_protection:
  # Switch the graph-specific security features on or off
  enabled: true

  # -------------------------------------------------------------------------
  # ACCESS MONITORING
  # Observe and analyse access patterns so potential data exfiltration is noticed
  # -------------------------------------------------------------------------
  access_monitoring:
    enabled: true

    # What gets logged:
    #   track_traversals       - all graph traversal operations
    #                            (BFS, DFS, shortest path, etc.)
    #   track_exports          - all bulk export operations
    #   track_embedding_access - vector embedding queries
    track_traversals: true
    track_exports: true
    track_embedding_access: true

    # Real-time anomaly detection
    anomaly_detection: true

    # Thresholds for anomaly detection.
    # An alert is raised when a user goes above the configured value N.
    anomaly_thresholds:
      # Nodes traversed per minute
      max_nodes_per_minute: 5000
      # Deep traversals performed per hour
      max_deep_traversals_per_hour: 50
      # Megabytes exported per day
      max_export_mb_per_day: 500
      # Embedding accesses per minute
      max_embedding_queries_per_minute: 100

    # Detection of suspicious access patterns
    patterns:
      # Systematic node enumeration (e.g., sequential PK access)
      detect_enumeration: true
      # Exhaustive crawling of the graph
      detect_crawling: true
      # Time-of-day anomalies (e.g., bulk access at 3 AM)
      detect_temporal_anomalies: true
      # Geographic anomalies (e.g., access from unusual locations)
      detect_geographic_anomalies: true

  # -------------------------------------------------------------------------
  # RATE LIMITING
  # Restrict query volume and complexity to prevent bulk extraction
  # -------------------------------------------------------------------------
  rate_limits:
    # --- Graph traversal limits ---
    # Maximum BFS/DFS depth
    max_traversal_depth: 5
    # Maximum nodes returned per query
    max_nodes_per_query: 1000
    # Maximum edges returned per query
    max_edges_per_query: 10000

    # --- Vector search limits ---
    # Maximum embeddings per KNN search
    max_embeddings_per_query: 500
    # Maximum embedding dimensions
    max_vector_dimensions: 1536

    # --- Query frequency limits (per user) ---
    # General query limit
    queries_per_minute: 50
    # Graph-specific queries
    graph_queries_per_minute: 30
    # Vector search queries
    # NOTE(review): this is higher than queries_per_minute (50); confirm that
    # the general limit does not also cap vector searches.
    vector_queries_per_minute: 100
    # Deep traversal operations
    traversals_per_hour: 200

    # --- Bulk operation limits ---
    # Maximum bulk exports per day
    bulk_operations_per_day: 10

    # --- Concurrent query limits ---
    # Per user
    max_concurrent_queries: 5

    # --- Time-window based limits ---
    # Nodes per hour
    hourly_node_access_limit: 50000
    # MB per day
    daily_data_transfer_limit_mb: 1000

  # -------------------------------------------------------------------------
  # EXPORT CONTROLS
  # Restrict and monitor bulk data exports
  # -------------------------------------------------------------------------
  export_controls:
    # Bulk export is off by default (enable only for approved users)
    bulk_export_enabled: false

    # Manual approval for large exports:
    # exports larger than approval_threshold_mb (100 MB) require approval
    require_approval: true
    approval_threshold_mb: 100

    # Maximum export size limits
    # Hard limit per export, in MB
    max_export_size_mb: 500
    # Maximum nodes per export
    max_export_nodes: 100000
    # Maximum edges per export
    max_export_edges: 500000

    # Export audit trail
    audit_all_exports: true
    # Hash exported data for tracking
    include_export_hash: true

    # Watermarking for exports (Phase 2 - optional);
    # enable after watermarking is implemented
    watermark_exports: false

    # Export scheduling (restrict exports to business hours)
    restrict_to_business_hours: false
    business_hours:
      start: "08:00"
      end: "18:00"
      timezone: "UTC"

  # -------------------------------------------------------------------------
  # GRAPH WATERMARKING (Phase 2 - Future Feature)
  # Embed imperceptible marks in graph structure for theft detection
  # -------------------------------------------------------------------------
  watermarking:
    # Graph watermarking switch (the feature still requires implementation)
    enabled: false

    # Watermark strength. Options: low, medium, high
    # A higher strength is more robust but potentially more detectable.
    strength: medium

    # Watermark method.
    # Options: edge_perturbation, dummy_nodes, weight_modification
    method: edge_perturbation

    # Watermark key rotation interval, in days
    key_rotation_days: 90

    # Watermark verification
    verify_on_export: true
    # Verifying on every query may impact performance
    verify_on_query: false

  # -------------------------------------------------------------------------
  # EMBEDDING PROTECTION (Phase 2 - Future Feature)
  # Add imperceptible noise to embeddings for fingerprinting
  # -------------------------------------------------------------------------
  embedding_protection:
    # Embedding fingerprinting switch (the feature still requires implementation)
    enabled: false

    # Noise magnitude, range 0.0 - 1.0
    # Lower values have less impact on search quality.
    noise_magnitude: 0.01

    # Noise method.
    # Options: deterministic_gaussian, laplace, uniform
    method: deterministic_gaussian

    # Fingerprint verification on access (may impact performance)
    verify_on_access: false

    # Secret key management
    key_rotation_days: 90
    # Use a Hardware Security Module if available
    use_hsm: true

  # -------------------------------------------------------------------------
  # DIFFERENTIAL PRIVACY (Phase 3 - Future Feature)
  # Add noise to aggregations for privacy preservation
  # -------------------------------------------------------------------------
  differential_privacy:
    # Enable differential privacy (requires implementation)
    enabled: false

    # Privacy parameters
    epsilon: 1.0                   # Privacy budget (lower = more private)
    # Written with an explicit decimal point on purpose: YAML 1.1 loaders
    # (e.g. PyYAML) only resolve exponent notation as a float when the
    # mantissa contains a ".", so a bare 1e-5 would load as the string "1e-5".
    delta: 1.0e-5                  # Failure probability

    # Apply to specific query types
    apply_to_aggregations: true
    apply_to_counts: true
    apply_to_statistics: true

    # Noise mechanism
    mechanism: laplace             # Options: laplace, gaussian

    # Privacy budget management
    budget_per_user_per_day: 10.0
    budget_reset_interval_hours: 24

# ============================================================================
# INTEGRATION WITH EXISTING SECURITY FEATURES
# ============================================================================

security:
  # -------------------------------------------------------------------------
  # RBAC INTEGRATION
  # Define graph-specific permissions
  # -------------------------------------------------------------------------
  rbac:
    enabled: true

    # Example roles, each carrying its own set of graph permissions
    roles:
      # Read and traverse only, no export
      - name: data_viewer
        graph_permissions:
          read: true
          traverse: true
          max_depth: 3
          export: false

      # May export, but every export needs approval
      - name: data_analyst
        graph_permissions:
          read: true
          traverse: true
          max_depth: 5
          export: true
          export_approval_required: true

      # May export and bulk-export without approval
      # NOTE(review): max_depth (10) is above
      # graph_protection.rate_limits.max_traversal_depth (5); confirm which
      # of the two values takes precedence.
      - name: data_scientist
        graph_permissions:
          read: true
          traverse: true
          max_depth: 10
          export: true
          bulk_export: true
          export_approval_required: false

      # Full access, including write and protection configuration
      - name: admin
        graph_permissions:
          read: true
          write: true
          traverse: true
          max_depth: -1  # unlimited
          export: true
          bulk_export: true
          configure_protection: true

  # -------------------------------------------------------------------------
  # AUDIT LOGGING
  # Enhanced logging for graph operations
  # -------------------------------------------------------------------------
  audit:
    enabled: true

    # Graph-specific event types written to the audit log
    log_events:
      - GRAPH_TRAVERSAL
      - BULK_NODE_ACCESS
      - BULK_EDGE_ACCESS
      - EMBEDDING_EXPORT
      - GRAPH_EXPORT
      - TEMPORAL_QUERY
      - ANOMALY_DETECTED

    # Detailed metadata attached to audit entries
    include_query_details: true
    include_access_patterns: true
    include_data_volume: true

    # Retention period, in days
    retention_days: 365

    # SIEM integration
    siem_enabled: true

  # -------------------------------------------------------------------------
  # ENCRYPTION
  # Protect data at rest and in transit
  # -------------------------------------------------------------------------
  encryption:
    # Encrypt sensitive graph attributes at field level
    field_encryption: true

    # Encrypt vector embeddings
    vector_encryption: true

    # Encrypt the audit logs
    audit_log_encryption: true

# ============================================================================
# MONITORING & ALERTING
# ============================================================================

monitoring:
  # Prometheus metrics export
  prometheus:
    enabled: true

    # Names of the graph-specific metrics
    metrics:
      - themis_graph_traversal_depth
      - themis_graph_nodes_accessed
      - themis_graph_edges_accessed
      - themis_embeddings_queried
      - themis_graph_exports_total
      - themis_graph_anomalies_detected

  # Alert rules
  # Each rule has a name, a condition, a severity and a description.
  # NOTE(review): the conditions are written as PromQL expressions; confirm
  # that this is how the consumer evaluates them.
  alerts:
    # Targets traversals deeper than 10. Prometheus histogram buckets are
    # cumulative, so the le="10" bucket on its own counts traversals of
    # depth <= 10; the deep ones are the total count minus that bucket.
    - name: SuspiciousGraphTraversal
      condition: >-
        rate(themis_graph_traversal_depth_count[5m])
        - ignoring(le) rate(themis_graph_traversal_depth_bucket{le="10"}[5m])
        > 10
      severity: warning
      description: "Unusual deep graph traversal detected"

    # NOTE(review): themis_graph_nodes_exported is not among the names listed
    # under monitoring.prometheus.metrics; confirm that it is exported.
    - name: BulkGraphExport
      condition: 'rate(themis_graph_nodes_exported[5m]) > 1000'
      severity: critical
      description: "Large-scale graph export detected"

    - name: EmbeddingTheft
      condition: 'rate(themis_embeddings_queried[5m]) > 500'
      severity: warning
      description: "Suspicious embedding access pattern"

    # NOTE(review): if themis_graph_anomalies_detected is a counter, this
    # condition stays true after the first anomaly; confirm the metric type.
    # NOTE(review): "high" is the only severity here that is neither
    # "warning" nor "critical"; confirm it is an accepted level.
    - name: GraphAnomalyDetected
      condition: 'themis_graph_anomalies_detected > 0'
      severity: high
      description: "Anomalous graph access pattern detected"

# ============================================================================
# DEPLOYMENT RECOMMENDATIONS
# ============================================================================

# For production deployments:
# 1. Start with conservative limits and adjust based on legitimate usage
# 2. Enable access monitoring and review logs regularly
# 3. Configure alerts for suspicious activities
# 4. Conduct regular security audits
# 5. Plan for Phase 2 features (watermarking, fingerprinting) if needed
# 6. Document approved export workflows
# 7. Train users on security best practices

# For high-security environments:
# 1. Enable all monitoring features
# 2. Set strict rate limits
# 3. Require approval for all exports
# 4. Implement watermarking (Phase 2)
# 5. Enable differential privacy (Phase 3) for aggregations
# 6. Use HSM for key management
# 7. Deploy in an air-gapped network if possible