openshift-eng · enxebre · May 14, 2026 · May 14, 2026 · May 14, 2026 · May 14, 2026
diff --git a/.claude/rules/evals.md b/.claude/rules/evals.md
@@ -0,0 +1,19 @@
+When modifying plugin commands or skills, add or update evals in `plugins/<name>/evals/`.
+
+Every eval test must have per-test metadata:
+```yaml
+metadata:
+  token-usage: small | medium | large
+  judge-size: none | sonnet | opus
+  tier: fast | medium | heavy
+```
+
+Use YAML anchors (`&meta-fast` / `*meta-fast`) to avoid repetition.
+
+After adding or modifying evals:
+1. Run `make lint` — the skillsaw linter validates metadata, tier classification, and budget compliance against `evals/budget.yaml`
+2. If lint fails, run `make lint-fix` to auto-fix what it can
+3. Run `make eval-plugins EVAL_PLUGIN=<name>` to verify tests pass
+4. Update the affected entry's `current` value under `budgets` in `evals/budget.yaml` if cost thresholds changed
+
+See `evals/AGENTS.md` for the full tiering model and budget rules.
diff --git a/.github/workflows/eval-plugins.yml b/.github/workflows/eval-plugins.yml
@@ -9,22 +9,26 @@ env:
   CLAUDE_CODE_USE_VERTEX: 'true'
 
 jobs:
-  detect-changed-plugins:
+  detect-changes:
     runs-on: ubuntu-latest
     outputs:
       plugins: ${{ steps.detect.outputs.plugins }}
       has_plugins: ${{ steps.detect.outputs.has_plugins }}
+      has_contributing: ${{ steps.detect.outputs.has_contributing }}
     steps:
       - name: Checkout code
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
         with:
           persist-credentials: false
           fetch-depth: 0
 
-  detect-changes:
-    runs-on: ubuntu-latest
-    outputs:
-      plugins: ${{ steps.detect.outputs.plugins }}
-      has_plugins: ${{ steps.detect.outputs.has_plugins }}
-      has_contributing: ${{ steps.detect.outputs.has_contributing }}
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
-        with:
-          persist-credentials: false
-          fetch-depth: 0
+  detect-changes:
+    permissions:
+      contents: read
+    runs-on: ubuntu-latest
+    outputs:
+      plugins: ${{ steps.detect.outputs.plugins }}
+      has_plugins: ${{ steps.detect.outputs.has_plugins }}
+      has_contributing: ${{ steps.detect.outputs.has_contributing }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false
+          fetch-depth: 0
-  detect-changes:
-    runs-on: ubuntu-latest
-    outputs:
-      plugins: ${{ steps.detect.outputs.plugins }}
-      has_plugins: ${{ steps.detect.outputs.has_plugins }}
-      has_contributing: ${{ steps.detect.outputs.has_contributing }}
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
-        with:
-          persist-credentials: false
-          fetch-depth: 0
+  detect-changes:
+    permissions:
+      contents: read
+    runs-on: ubuntu-latest
+    outputs:
+      plugins: ${{ steps.detect.outputs.plugins }}
+      has_plugins: ${{ steps.detect.outputs.has_plugins }}
+      has_contributing: ${{ steps.detect.outputs.has_contributing }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false
+          fetch-depth: 0
-      - name: Detect changed plugins with evals
+      - name: Detect what changed
         id: detect
+        env:
+          # Hand the base branch to the script through the environment rather
+          # than expanding the expression inside the script text, so the ref
+          # name is treated as data and never parsed as shell code.
+          BASE_REF: ${{ github.base_ref }}
         run: |
-          plugins=$(git diff --name-only origin/${{ github.base_ref }}...HEAD -- plugins/ \
+          CHANGED=$(git diff --name-only "origin/${BASE_REF}...HEAD")
+
+          # Detect changed plugins with evals
+          plugins=$(echo "$CHANGED" | grep '^plugins/' \
             | sed -n 's|^plugins/\([^/]*\)/.*|\1|p' \
             | sort -u \
             | while read plugin; do
@@ -39,16 +43,24 @@ jobs:
           fi
           echo "Changed plugins with evals: ${plugins}"
 
+          # Detect contributing-relevant changes. Exact file names are anchored
+          # at both ends so look-alikes (e.g. CLAUDE.md.bak) do not trigger the
+          # job; anything under .claude/rules/ matches by prefix.
+          # evals/AGENTS.md is included because the root AGENTS.md and
+          # .claude/rules/evals.md both point contributors to it.
+          if echo "$CHANGED" | grep -qE '^(CLAUDE\.md|AGENTS\.md|evals/AGENTS\.md|evals/promptfooconfig\.yaml)$|^\.claude/rules/'; then
+            echo "has_contributing=true" >> "$GITHUB_OUTPUT"
+            echo "Contributing files changed — will run eval-contributing"
+          else
+            echo "has_contributing=false" >> "$GITHUB_OUTPUT"
+          fi
+
   behavioral-evals:
-    needs: detect-changed-plugins
-    if: needs.detect-changed-plugins.outputs.has_plugins == 'true'
+    needs: detect-changes
+    if: needs.detect-changes.outputs.has_plugins == 'true'
     runs-on: ubuntu-latest
     permissions:
       contents: read
     strategy:
       fail-fast: false
       matrix:
-        plugin: ${{ fromJson(needs.detect-changed-plugins.outputs.plugins) }}
+        plugin: ${{ fromJson(needs.detect-changes.outputs.plugins) }}
     steps:
       - name: Checkout code
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -80,3 +92,30 @@ jobs:
           name: eval-results-${{ matrix.plugin }}
           path: eval-results/*.xml
           retention-days: 30
+
+  # Runs the repo-level contributing evals (`make eval-contributing`).
+  # Only executes when the detect-changes job reported
+  # has_contributing == 'true'.
+  contributing-evals:
+    needs: detect-changes
+    if: needs.detect-changes.outputs.has_contributing == 'true'
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false
+
+      - name: Setup Node.js
+        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
+        with:
+          node-version: '22'
+
+      # Credentials are read from the GOOGLE_APPLICATION_CREDENTIALS secret.
+      - name: Authenticate to GCP
+        uses: google-github-actions/auth@ba79af03959ebeac9769e648f473a284504d9193 # v2.1.10
+        with:
+          credentials_json: ${{ secrets.GOOGLE_APPLICATION_CREDENTIALS }}
+
+      # The project id is exported to the make target through the step
+      # environment.
+      - name: Run contributing evals
+        env:
+          ANTHROPIC_VERTEX_PROJECT_ID: ${{ secrets.ANTHROPIC_VERTEX_PROJECT_ID }}
+        run: make eval-contributing
diff --git a/.gitignore b/.gitignore
@@ -35,6 +35,7 @@ venv/
 # Claude's settings for this repository
 .claude/*
 !.claude/settings.json
+!.claude/rules/
 
 # Working directories for skills
 .work/

diff --git a/.skillsaw/promptfoo_budget_rule.py b/.skillsaw/promptfoo_budget_rule.py
@@ -237,7 +237,9 @@ def _entity_name(context: RepositoryContext, node: PromptfooConfigNode) -> Optio
         if parent_plugin is not None:
             return parent_plugin.path.name
 
-        return None
+        # Root-level configs (e.g. evals/promptfooconfig.yaml) use the
+        # "contributing" budget entity.
+        return "contributing"
 
     # ------------------------------------------------------------------
     # token-usage validation

diff --git a/AGENTS.md b/AGENTS.md
@@ -23,6 +23,8 @@ Canonical example: `plugins/hello-world/`
 | Command | When |
 |---------|------|
 | `make lint` | Before every commit — validates structure, format, and marketplace registration |
+| `make lint-fix` | After lint failures — auto-fixes what it can |
+| `make eval-plugins EVAL_PLUGIN=<name>` | After modifying a plugin's commands or skills — runs behavioral evals |
 | Bump `version` in `plugin.json` | When modifying plugin commands or skills (not README-only changes) |
 | `make update` | After version bumps — syncs marketplace.json and regenerates docs |
 
@@ -37,3 +39,4 @@ Canonical example: `plugins/hello-world/`
 - **Register all plugins** in [.claude-plugin/marketplace.json](.claude-plugin/marketplace.json).
 - **Set author** to `"github.com/openshift-eng"` in `plugin.json`.
 - **Add new commands** to an existing plugin when they fit its scope, or to `plugins/utils/` if no clear parent. Create a new plugin only for a distinct group of related commands.
+- **Add evals** for new or modified plugins. See [evals/AGENTS.md](evals/AGENTS.md) for how to write evals, tag metadata, and stay within budget.
diff --git a/Makefile b/Makefile
@@ -18,6 +18,10 @@ help: ## Show this help message
 lint: ## Run plugin linter (verbose, strict mode)
 	$(CONTAINER_RUNTIME) run --rm --platform linux/amd64 $(SELINUX_OPT) -v $(PWD):/workspace:Z $(SKILLSAW_IMAGE) .
 
+.PHONY: lint-fix
+lint-fix: ## Auto-fix lint violations
+	$(CONTAINER_RUNTIME) run --rm --platform linux/amd64 $(SELINUX_OPT) -v $(PWD):/workspace:Z $(SKILLSAW_IMAGE) fix -y .
+
 .PHONY: lint-pull
 lint-pull: ## Pull the latest skillsaw image
 	$(CONTAINER_RUNTIME) pull $(SKILLSAW_IMAGE)
@@ -59,4 +63,16 @@ $(EVAL_TARGETS):
 		--no-cache \
 		--table-cell-max-length 500
 
+# Runs promptfoo against the root config (evals/promptfooconfig.yaml).
+# EVAL_PASS_RATE_THRESHOLD, EVAL_REPEAT and EVAL_FILTER are not set in this
+# target — presumably defined earlier in this Makefile; verify.
+# --filter-pattern is only passed when EVAL_FILTER is non-empty.
+.PHONY: eval-contributing
+eval-contributing: ## Run contributing workflow evals (root evals/promptfooconfig.yaml)
+	@npm install
+	@CLAUDE_CODE_USE_VERTEX=true \
+	PROMPTFOO_PASS_RATE_THRESHOLD=$(EVAL_PASS_RATE_THRESHOLD) \
+		npx promptfoo eval \
+		-c evals/promptfooconfig.yaml \
+		$(if $(EVAL_FILTER),--filter-pattern "$(EVAL_FILTER)") \
+		--repeat $(EVAL_REPEAT) \
+		--no-cache \
+		--table-cell-max-length 500
+
 .DEFAULT_GOAL := help
diff --git a/evals/AGENTS.md b/evals/AGENTS.md
@@ -245,6 +245,23 @@ npx promptfoo view
 
 The workflow requires `ANTHROPIC_VERTEX_PROJECT_ID` and `GOOGLE_APPLICATION_CREDENTIALS` as GitHub secrets.
 
+## Linting
+
+The skillsaw linter validates eval configs (metadata, tier classification, budget compliance) via a custom rule in `.skillsaw/promptfoo_budget_rule.py`. Always run the linter before committing:
+
+```bash
+# Lint
+make lint
+
+# If it fails, auto-fix what it can
+make lint-fix
+```
+
+The linter checks:
+- Every test has `token-usage`, `judge-size`, and `tier` metadata
+- Token-usage and tier are correctly classified per `evals/budget.yaml` rules
+- Per-plugin cost stays within budget
+
 ## Adding Evals for a New Plugin
 
 1. Create `plugins/<name>/evals/<test-name>.yaml`
@@ -254,4 +271,5 @@ The workflow requires `ANTHROPIC_VERTEX_PROJECT_ID` and `GOOGLE_APPLICATION_CRED
 5. Add `cost` and `latency` thresholds in `defaultTest.assert` based on observed values (run once, then set 2-3x)
 6. Add per-test `metadata` with `token-usage`, `judge-size`, and `tier` — use YAML anchors to DRY
 7. For test data, create `plugins/<name>/evals/fixtures/` and reference via `file://fixtures/<name>.md`
-8. Run locally: `make eval-plugins EVAL_PLUGIN=<name>`
+8. Run `make lint` — fix any budget or metadata violations
+9. Run locally: `make eval-plugins EVAL_PLUGIN=<name>`
diff --git a/evals/budget.yaml b/evals/budget.yaml
@@ -67,6 +67,9 @@ budgets:
   jira:
     allowed: 7.00
     current: 6.00      # 3 tests × $1.20 + 4 tests × $0.60
+  contributing:
+    allowed: 1.00
+    current: 0.50      # 1 test × $0.50 (evals/promptfooconfig.yaml)
 
 # Linter validation rules:
 #

diff --git a/evals/promptfooconfig.yaml b/evals/promptfooconfig.yaml
@@ -1,38 +1,57 @@
-# Root eval config — smoke test to verify the provider works.
-# For full evals, use make eval-plugins or run individual plugin evals.
+# Root evals — contributing workflow validation.
+# These test the repo-level behavior, not individual plugins.
 
-description: "ai-helpers — provider smoke test"
+description: "ai-helpers — root evals"
 
 providers:
   - id: anthropic:claude-agent-sdk
-    label: smoke
+    label: ai-helpers
     config:
       model: claude-opus-4-6
-      plugins:
-        - type: local
-          path: ../plugins/hello-world
-      append_allowed_tools: ['Bash']
-      permission_mode: 'auto'
+      working_dir: ../
+      append_allowed_tools: ['Read', 'Grep', 'Glob']
+      permission_mode: 'default'
 
 prompts:
   - "{{prompt}}"
 
 defaultTest:
   options:
     provider:
-      id: vertex:claude-opus-4-6
+      id: vertex:claude-sonnet-4-6
       config:
         projectId: "{{ env.ANTHROPIC_VERTEX_PROJECT_ID }}"
         region: global
         temperature: 0
+  # Default assertions under defaultTest.
+  # The latency threshold is in milliseconds; the cost threshold is in USD.
+  assert:
+    - type: latency
+      threshold: 30000
+    - type: cost
+      threshold: 0.50
 
 tests:
-  # skillsaw-disable promptfoo-budget
-  # skillsaw-disable promptfoo-assertions
-  # skillsaw-disable promptfoo-metadata
-  - description: "smoke/provider-loads"
+  # Plan-only scenario: the prompt tells the agent not to create files, so the
+  # assertions below grade the described plan rather than any repo changes.
+  - description: "contributing/new-plugin-plan — follows contributing rules"
+    metadata:
+      token-usage: small
+      judge-size: sonnet
+      tier: medium
     vars:
-      prompt: "Run /hello-world:echo"
+      prompt: |
+        I want to create a new plugin called "bye-world" with a single command
+        "farewell" that prints "Goodbye world" or "Goodbye <name>".
+        Do NOT create any files — only describe what you would do, what files
+        you would create, and what verification steps you would run.
     assert:
+      - type: llm-rubric
+        value: "The plan includes a verification section that mentions running make lint (or make lint-fix if lint fails)"
+      - type: llm-rubric
+        value: "The plan mentions adding evals or running make eval-plugins for the new plugin"
+      - type: llm-rubric
+        value: "The plan mentions creating a plugin.json with name, description, version, and author fields"
+      - type: llm-rubric
+        value: "The plan mentions registering the plugin in marketplace.json or running make update"
+      # Case-insensitive substring check, in addition to the rubric
+      # assertions above.
       - type: icontains
-        value: "Hello world"
+        value: "make lint"
+
+evaluateOptions:
+  maxConcurrency: 5