# Source: cleancloud.yaml (main branch) — github.com/cleancloud-io/cleancloud
# CleanCloud default config (auto-detected)
# Safe baseline for local scans.
# Override via --config or CLI flags in CI.
# Tip: Run `cleancloud doctor` to validate credentials and permissions before scanning.
#
# Resolution order (highest priority first):
#   1. --config <path>              explicit path
#   2. ./cleancloud.yaml            this file (project-level)
#   3. ~/.cleancloud.yaml           user-level defaults (future)
#   4. .cleancloud/config.yaml      alternate project location
version: 1  # config format version (NOTE(review): presumably the schema version — confirm)

# ── Scan defaults (execution context) ────────────────────────────────────────
# CLI flags always take precedence. These kick in when flags are omitted.
# Lets you run: cleancloud scan  (no other flags needed)
scan:
  # Region selection. `auto` detects active regions, the same as passing
  # --all-regions. To pin specific regions instead (the same as --region),
  # replace the value with a list:
  #   regions:
  #     - us-east-1
  regions: auto

  # Cloud provider to scan. May be omitted when credentials are configured
  # for only one provider.
  provider: aws

# ── Filtering precedence (strongest first) ────────────────────────────────────
# 1. Exceptions       — explicit human approvals, bypass all other filters
# 2. Tag filtering    — resource-scope exclusion by tag/label
# 3. Rule enable/disable + params — pre-scan rule selection
# 4. Defaults         — global fallbacks for min_cost / confidence / override_risk_level
# 5. Thresholds       — CI/CD exit-code policy

# ── Defaults ─────────────────────────────────────────────────────────────────
# Global fallbacks applied to all rules unless overridden at the rule level.
# min_cost is evaluated per finding against estimated_monthly_cost_usd, not against the total across all findings.
defaults:
  # MEDIUM suppresses LOW confidence findings globally.
  confidence: MEDIUM

  # Low-noise baseline: findings estimated below $5/month are suppressed.
  min_cost: 5

# ── Tag filtering ────────────────────────────────────────────────────────────
# Suppress findings on resources that carry these tags/labels.
# Applied AFTER exceptions — explicitly-excepted resources are never re-suppressed by a tag rule.
# Useful for excluding intentional infrastructure (prod keep-warm, IaC-managed resources).
tag_filtering:
  enabled: true
  # "exclude" is the only supported mode: it suppresses findings on matched resources.
  mode: exclude
  ignore:
    # Suppress findings on production and staging resources.
    - key: env
      values:
        - production
        - staging
    # Key-only match: any value of this tag/label suppresses the finding.
    - key: cleancloud-ignore

# ── Rule configuration ───────────────────────────────────────────────────────
# Enable or disable rules, tune thresholds, or pass parameters to individual rules.
# Rules not listed here are enabled by default (opt-out model).
# Per-rule settings override defaults above.
# Rule IDs must match exactly — a typo raises an error at scan start with a suggestion.
# Run `cleancloud scan --help` or see docs/rules.md for the full list.
rules:
  # Switched off: the team manages tags separately.
  aws.resource.untagged:
    enabled: false

  # Only flag AMIs with an estimated cost of at least $5/month. A per-rule
  # min_cost takes precedence over the global default; in this sample it is
  # set to the same value as defaults.min_cost, so it illustrates the syntax
  # rather than changing the outcome.
  aws.ec2.ami.old:
    enabled: true
    min_cost: 5

  # Require 21 idle days before flagging (rule default: 14).
  aws.rds.instance.idle:
    enabled: true
    params:
      idle_days_threshold: 21

  # Require 21 idle days before flagging (rule default: 14).
  # NOTE(review): this rule's parameter is `idle_days`, while
  # aws.rds.instance.idle uses `idle_days_threshold` — confirm both names
  # against docs/rules.md.
  gcp.sql.instance.idle:
    enabled: true
    params:
      idle_days: 21

  # Flag snapshots older than 60 days (rule default: 90), and include LOW
  # confidence findings for this rule, overriding the global default.
  azure.compute.snapshot.old:
    enabled: true
    confidence: LOW
    params:
      max_age_days: 60

  # override_risk_level replaces the `risk` field on findings; it affects
  # display and reporting only. fail_on_confidence thresholds are NOT affected,
  # because they use signal strength rather than risk.
  aws.sagemaker.endpoint.idle:
    enabled: true
    override_risk_level: HIGH

  # aws.ec2.gpu.idle:
  #   params:
  #     idle_days: 14           # default: 7   (days of low utilisation before flagging)
  #     gpu_threshold: 10.0     # default: 5.0 (max GPU utilisation %, HIGH confidence path)
  #     cpu_threshold: 20.0     # default: 10.0 (max CPU utilisation %, MEDIUM confidence fallback)

  # aws.bedrock.provisioned_throughput.idle:
  #   params:
  #     idle_days: 14           # default: 7 (days of zero invocations before flagging)

  # aws.sagemaker.studio_app.idle:
  #   params:
  #     idle_days: 14           # default: 7 (days of no user activity before flagging)

  # azure.ml.online_endpoint.idle:
  #   params:
  #     idle_days: 14           # default: 7 (days of zero scoring requests before flagging)

  # azure.ai_search.idle:
  #   params:
  #     idle_days: 60           # default: 30 (days of zero queries before flagging)

  # azure.openai.provisioned_deployment.idle:
  #   params:
  #     idle_days: 14           # default: 7 (days of zero requests before flagging)

  # gcp.tpu.idle:
  #   params:
  #     idle_days: 14           # default: 7 (days of low duty-cycle before flagging)

  # gcp.vertex.featurestore.idle:
  #   params:
  #     idle_days: 60           # default: 30 (days of zero online serving requests before flagging)

# ── Categories ───────────────────────────────────────────────────────────────
# Override the default category without using the --category CLI flag.
# Equivalent to: cleancloud scan --provider aws --category all
# CLI --category flag takes precedence when explicitly set.
# categories:
#   include: [hygiene, ai]    # run both hygiene and AI/ML rules

# ── Exceptions ───────────────────────────────────────────────────────────────
# Strongest filter — excepted resources bypass tag filtering and all other suppression.
# Fields: rule_id (exact), resource_id (glob: *, ?, [seq]), account_id, region, reason, expires_at (date, e.g. "2026-12-31").
# account_id and region narrow the match — omit to match any account/region.
# Document the reason for every exception — these are git-reviewable approvals.
exceptions:
  # Reviewed quarterly; the exception auto-expires if it is not renewed.
  - rule_id: aws.ec2.instance.stopped
    resource_id: 'i-0abc1234567890def'
    expires_at: '2026-12-31'
    reason: 'Bastion host — kept stopped intentionally, started on demand'

  - rule_id: aws.rds.instance.idle
    resource_id: 'db-prod-reporting'
    reason: 'Quarterly reporting DB — idle between cycles, do not terminate'

  # Glob pattern: suppresses all test databases.
  - rule_id: aws.rds.instance.idle
    resource_id: 'db-test-*'
    reason: 'Test databases are ephemeral and frequently idle'

  # account_id and region narrow the match to a specific AWS account and region.
  - rule_id: aws.ebs.unattached
    resource_id: 'vol-*'
    region: us-west-2
    account_id: '111111111111'
    reason: 'Archive volumes in legacy account, migration planned'

  - rule_id: gcp.sql.instance.idle
    resource_id: 'my-project:us-central1:analytics-db'
    reason: 'Idle between quarterly runs'

# ── Thresholds ───────────────────────────────────────────────────────────────
# CI/CD exit-code policy. CLI --fail-on-* flags take precedence if both are set.
# fail_on_confidence uses signal strength (LOW/MEDIUM/HIGH), not override_risk_level.
#
# Exit codes:
#   0 — No threshold breached (no findings, all filtered/suppressed, or remaining findings below the configured thresholds)
#   1 — Unexpected error (bug or infrastructure failure)
#   2 — Policy violation — a threshold was breached (CI/CD should fail here)
#   3 — Permission error — insufficient IAM/RBAC permissions to complete scan
#
# Evaluated in order (OR logic — first matching condition triggers exit 2):
#   1. fail_on_findings: true  → any remaining finding
#   2. fail_on_confidence: X   → any finding with confidence >= X
#   3. fail_on_cost: X         → total estimated_monthly_cost_usd across findings >= X
thresholds:
  # Do not fail on every finding — too noisy for most teams.
  fail_on_findings: false
  # Exit 2 when any HIGH confidence finding remains after filtering.
  fail_on_confidence: HIGH
  # Exit 2 when total estimated waste is $500/month or more.
  fail_on_cost: 500