diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 353381929..dc2afd32d 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -246,6 +246,12 @@ "source": "./plugins/marketplace-ops", "description": "Maintenance commands for Claude Code plugin marketplaces", "version": "0.1.2" + }, + { + "name": "agentic-docs", + "source": "./plugins/agentic-docs", + "description": "Create and maintain AI-optimized documentation for OpenShift", + "version": "1.1.0" } ] } diff --git a/docs/data.json b/docs/data.json index 9da6dddca..317f4938d 100644 --- a/docs/data.json +++ b/docs/data.json @@ -1822,6 +1822,26 @@ "name": "marketplace-ops", "skills": [], "version": "0.1.2" + }, + { + "commands": [], + "description": "Create and maintain AI-optimized documentation for OpenShift", + "has_readme": true, + "hooks": [], + "name": "agentic-docs", + "skills": [ + { + "description": "Create lean component documentation for OpenShift repositories", + "id": "component", + "name": "component-docs" + }, + { + "description": "Update existing platform documentation with automatic gap detection in openshift/enhancements", + "id": "update-platform-docs", + "name": "update-platform-docs" + } + ], + "version": "1.1.0" } ] } \ No newline at end of file diff --git a/plugins/agentic-docs/.claude-plugin/plugin.json b/plugins/agentic-docs/.claude-plugin/plugin.json new file mode 100644 index 000000000..153132c0f --- /dev/null +++ b/plugins/agentic-docs/.claude-plugin/plugin.json @@ -0,0 +1,8 @@ +{ + "name": "agentic-docs", + "description": "Create and maintain AI-optimized documentation for OpenShift", + "version": "1.1.0", + "author": { + "name": "github.com/openshift-eng" + } +} diff --git a/plugins/agentic-docs/commands/evaluate.md b/plugins/agentic-docs/commands/evaluate.md new file mode 100644 index 000000000..dcc706fe7 --- /dev/null +++ b/plugins/agentic-docs/commands/evaluate.md @@ -0,0 +1,55 @@ +--- +description: Evaluate agentic 
documentation quality using promptfoo-based behavioral validation +argument-hint: "[repository-path]" +--- + +## Name +agentic-docs:evaluate + +## Synopsis +``` +/agentic-docs:evaluate [repository-path] +``` + +## Description +The `agentic-docs:evaluate` command evaluates documentation quality by testing whether AI agents naturally discover and correctly apply repository conventions without being explicitly told to read documentation. + +This command validates **documentation-first natural discovery behavior** using the OpenShift Enhancements Agentic Docs Evaluation framework. It measures: +- **Natural discovery**: Does the agent find documentation without instruction? +- **Correct navigation**: Does the agent follow documentation structure? +- **Pattern application**: Does the agent apply repository conventions correctly? +- **Anti-pattern rejection**: Does the agent reject incorrect patterns? + +The evaluation uses promptfoo to run assertions from `promptfooconfig.yaml` and generates detailed HTML reports with pass/fail grades. + +## Implementation +When this command is invoked, Claude will execute the `agentic-docs:evaluate` skill, which: +1. Loads evaluation configuration from `promptfooconfig.yaml` +2. Runs coding sub-agents with task descriptions (no explicit file instructions) +3. Evaluates whether agents naturally discovered and applied documentation +4. Generates graded results with pass/fail assertions +5. Creates HTML reports for review + +The skill maintains strict separation between coding agents (who must discover docs naturally) and evaluation agents (who grade the results). + +## Return Value +- Evaluation results with pass/fail grades for each test case +- HTML report showing which documentation was discovered and applied +- Metrics on natural discovery patterns + +## Examples + +1. **Evaluate current directory**: + ``` + /agentic-docs:evaluate + ``` + Evaluates documentation in the current working directory. + +2. 
**Evaluate specific repository**: + ``` + /agentic-docs:evaluate /path/to/openshift/repo + ``` + Evaluates documentation in the specified repository. + +## Arguments +- `repository-path` (optional): Path to the repository to evaluate. Defaults to current directory if not specified. diff --git a/plugins/agentic-docs/commands/generate-evals.md b/plugins/agentic-docs/commands/generate-evals.md new file mode 100644 index 000000000..f2010e211 --- /dev/null +++ b/plugins/agentic-docs/commands/generate-evals.md @@ -0,0 +1,54 @@ +--- +description: Generate repository-specific promptfoo evaluation suites for OpenShift documentation +argument-hint: "[repository-path]" +--- + +## Name +agentic-docs:generate-evals + +## Synopsis +``` +/agentic-docs:generate-evals [repository-path] +``` + +## Description +The `agentic-docs:generate-evals` command generates a tailored `promptfooconfig.yaml` evaluation suite for a specific OpenShift repository. Instead of using a generic evaluation configuration, it analyzes the repository's documentation structure, code patterns, and conventions to create repository-specific test cases. + +The generated evaluation suite tests whether AI agents can: +- Naturally discover repository documentation +- Apply repository-specific patterns correctly +- Follow established conventions without explicit instruction +- Reject anti-patterns specific to the repository + +This follows the OpenShift Enhancements Agentic Docs Evaluation framework, which emphasizes documentation-first natural discovery. + +## Implementation +When this command is invoked, Claude will execute the `agentic-docs:generate-evals` skill, which: +1. Analyzes repository documentation structure (CLAUDE.md, ai-docs/, ARCHITECTURE.md) +2. Identifies code patterns (API versions, operator patterns, controller structure) +3. Extracts repository-specific conventions +4. Generates test cases that validate natural documentation discovery +5. 
Creates `promptfooconfig.yaml` with assertions tailored to the repository +6. Saves configuration to the repository root + +## Return Value +- Generated `promptfooconfig.yaml` file in the repository root +- Test cases specific to the repository's patterns and conventions +- Assertions configured for natural discovery validation + +## Examples + +1. **Generate evals for current directory**: + ``` + /agentic-docs:generate-evals + ``` + Analyzes the current repository and generates `promptfooconfig.yaml`. + +2. **Generate evals for specific repository**: + ``` + /agentic-docs:generate-evals /path/to/openshift/repo + ``` + Analyzes the specified repository and generates tailored evaluation configuration. + +## Arguments +- `repository-path` (optional): Path to the target repository for analysis. Defaults to current directory if not specified. diff --git a/plugins/agentic-docs/skills/evaluate/SKILL.md b/plugins/agentic-docs/skills/evaluate/SKILL.md new file mode 100644 index 000000000..c03a3ddfe --- /dev/null +++ b/plugins/agentic-docs/skills/evaluate/SKILL.md @@ -0,0 +1,726 @@ +--- +name: agentic-docs:evaluate +description: "Evaluate agentic documentation quality using promptfoo-based behavioral validation with natural discovery testing. Use this skill when users want to test if AI agents can find and use documentation, run evaluation tests, check documentation quality, or see which documentation files are most accessed. Trigger on phrases like 'evaluate documentation', 'run evaluation', 'test the docs', 'check if agents can find the documentation', or 'see what files are accessed'." +trigger: /agentic-docs:evaluate +--- + +# Agentic-Docs: Evaluate + +**Trigger**: `/agentic-docs:evaluate` +**Purpose**: Evaluate documentation quality by running promptfoo test suite and analyzing results + +## Core Principle + +This evaluation validates **documentation-first natural discovery behavior**: + +> Agents are NOT told to read documentation. 
+> They must naturally discover documentation and apply guidance correctly. + +Tests measure: +- **Natural discovery**: Does agent find documentation without instruction? +- **Correct navigation**: Does agent follow documentation structure? +- **Pattern application**: Does agent apply repository conventions correctly? +- **Anti-pattern rejection**: Does agent reject incorrect patterns? + +## Architecture (Simplified) + +``` +┌─────────────────────────────────────────┐ +│ Main Agent (You - the skill executor) │ +│ │ +│ 1. Run bundled run-eval.sh script │ +│ 2. Capture promptfoo results │ +│ 3. Collect session metrics │ +│ 4. Spawn judge sub-agent │ +│ │ +│ Script: ${CLAUDE_PLUGIN_ROOT}/skills/evaluate/scripts/run-eval.sh +└─────────────────────────────────────────┘ + │ + │ results + metrics + ↓ +┌─────────────────────────────────────────┐ +│ Judge Claude Sub-Agent │ +│ │ +│ 1. Analyze promptfoo results │ +│ 2. Analyze session metrics │ +│ 3. Generate comprehensive report │ +│ 4. Provide recommendations │ +└─────────────────────────────────────────┘ +``` + +## Prerequisites + +**Required files**: +- `promptfooconfig.yaml` in repository root (generated by `/agentic-docs:generate-evals`) + +**Environment**: +- Node.js 18+ (for promptfoo) +- ANTHROPIC_API_KEY or ANTHROPIC_VERTEX_PROJECT_ID + +## Implementation Workflow + +### Step 1: Pre-flight Checks + +Check for required files, validate configuration, and verify environment: + +```bash +# Check promptfooconfig.yaml exists +if [ ! -f promptfooconfig.yaml ]; then + echo "ERROR: Evaluation configuration not found" + echo "" + echo "The evaluation requires promptfooconfig.yaml to be present in the repository root." + echo "" + echo "To generate it, run:" + echo " /agentic-docs:generate-evals" + echo "" + echo "This will create a tailored evaluation suite for this repository." 
+ exit 1 +fi + +# Check API credentials +if [ -z "$ANTHROPIC_API_KEY" ] && [ -z "$ANTHROPIC_VERTEX_PROJECT_ID" ]; then + echo "ERROR: No API credentials found" + echo "" + echo "Set one of:" + echo " export ANTHROPIC_API_KEY='your-key-here'" + echo " export ANTHROPIC_VERTEX_PROJECT_ID='your-project-id'" + exit 1 +fi +``` + +**Validate provider configuration** using the bundled validation script: + +```bash +# Run provider configuration validation +bash ${CLAUDE_PLUGIN_ROOT}/scripts/validate-provider.sh + +# Check exit code: +# 0 = valid configuration, proceed +# 1 = config file not found +# 2 = invalid provider format detected +``` + +**The validation script detects common issues**: + +1. **Incorrect Vertex AI format**: `vertex:anthropic:claude-...` + - Should be: `vertex:claude-...` (no "anthropic:" in the middle) + +2. **Incorrect Vertex AI config**: `apiKey: vertex://...` + - Should use: `projectId` and `region` instead + +3. **API format**: `anthropic:messages:claude-...` + - Should be: `anthropic:claude-...` (no "messages:") + +**Valid provider formats**: + +- Simple Anthropic API: `anthropic:claude-sonnet-4-6` +- Vertex AI: `vertex:claude-sonnet-4-6` (with `projectId` and `region` config) + +**If validation fails (exit code 2)**: The script outputs a detailed error message explaining: +- What issue was detected +- The current (incorrect) configuration +- The correct format to use (both Anthropic and Vertex AI options) +- How to fix it (edit or regenerate) +- Reference to HyperShift config for Vertex AI examples + +Display this error message to the user and STOP. Do not proceed to run evaluation. + +**If checks fail**: Display the error message to the user and STOP. Do not proceed to run evaluation. 
+ +### Step 2: Run Promptfoo Evaluation + +**YOU (the main agent) run the evaluation** using the bundled script: + +```bash +# Navigate to repository +cd {{repository_path}} + +# Run the bundled evaluation script +bash "${CLAUDE_PLUGIN_ROOT}/skills/evaluate/scripts/run-eval.sh" + +# The script will: +# - Auto-detect API credentials (Vertex AI or Anthropic API) +# - Run promptfoo with proper environment setup +# - Output results to ./promptfoo-results.json +``` + +**Important**: +- Use `${CLAUDE_PLUGIN_ROOT}/skills/evaluate/scripts/run-eval.sh` (the bundled script) +- Do NOT run `promptfoo eval` directly +- The script handles NVM/Node.js setup and proper configuration +- Wait for script to complete (may take 1-5 minutes for 43 tests) +- Results are written to `./promptfoo-results.json` + +**After execution completes**: +1. Read the JSON results: `cat promptfoo-results.json` +2. Check the terminal output for summary (passed/failed/errors) +3. Capture both JSON and summary for judge analysis + +### Step 3: Collect Session Metrics + +**YOU (the main agent) collect session metrics** from the current session: + +```bash +# Get current session info +SESSION_DIR=$(ls -td ~/.claude/projects/*/ | head -1) +SESSION_FILE=$(ls -t "$SESSION_DIR"/*.jsonl | head -1) + +# Run metrics collection using metrics plugin +if command -v python3 &> /dev/null && [ -f "${CLAUDE_PLUGIN_ROOT}/../../metrics/scripts/ai_docs_telemetry.py" ]; then + python3 ${CLAUDE_PLUGIN_ROOT}/../../metrics/scripts/ai_docs_telemetry.py \ + -session "$SESSION_FILE" > metrics.json +else + # Metrics plugin not available - continue without metrics + echo '{"warning": "metrics plugin not available", "files_accessed": []}' > metrics.json +fi +``` + +**Metrics to capture** (if available): +- Total tokens used in session +- Session duration +- Files accessed (ai-docs/, AGENTS.md, etc.) 
+- Entry points used +- Navigation patterns + +**Output**: `metrics.json` file (or warning if metrics unavailable) + +**Note**: If metrics collection fails, continue anyway - promptfoo results are the primary data. + +### Step 4: Spawn Judge Sub-Agent + +**YOU (the main agent) spawn the judge sub-agent** using the Agent tool: + +Use this exact pattern: + +```python +Agent( + description="Analyze evaluation results and generate report", + prompt=f""" +You are a judge sub-agent responsible for evaluating agentic documentation quality. + +You have been provided with: +1. Promptfoo test results (JSON format) +2. Session metrics (JSON format if available) + +Your task: Analyze the results and generate a comprehensive evaluation report. + +## Inputs + +### Promptfoo Results +{promptfoo_results_json} + +### Session Metrics +{session_metrics_json} + +## Evaluation Criteria + +### 1. Test Results Analysis + +**First, determine the error rate**: +- If >50% of tests are errors (not failures): This is a configuration problem, not a documentation problem +- Errors indicate: provider config wrong, API auth failed, or promptfoo setup issues +- Failures indicate: documentation gaps or quality issues + +**Analyze test results**: +- Count: passed, failed, errors +- Group by category (if categories are in test names/descriptions) +- Calculate pass rate: passed / (passed + failed) - exclude errors from quality calculation +- Identify patterns in failures vs errors + +**For each failing test**: +- Extract test name/description +- Identify what was expected vs actual +- Determine root cause (documentation gap, unclear guidance, missing reference) + +**For errors**: +- Identify error type (provider error, auth error, config error) +- Common error patterns +- Recommended fixes for errors (separate from doc improvements) + +### 2. 
Metrics Analysis (if available) + +If session metrics provided: +- Token usage: Normal/High/Low relative to test complexity +- Files accessed: Which docs were read during evaluation +- Entry points: Did agents find AGENTS.md or other entry points? +- Navigation patterns: Sequential or scattered access + +If metrics unavailable: +- Note that metrics analysis is skipped +- Focus entirely on promptfoo results + +### 3. Overall Assessment + +**Quality threshold** (for non-error tests): +- PASS: >90% pass rate with zero anti-pattern failures +- FAIL: <90% pass rate or any anti-pattern failures + +**Configuration health**: +- HEALTHY: <10% error rate +- NEEDS FIX: >10% error rate (fix config before judging doc quality) + +## Output Format + +Generate a detailed evaluation report in markdown format: + +```markdown +# Agentic Documentation Evaluation Report + +**Repository**: {{repository_name}} +**Date**: {{evaluation_date}} +**Evaluator**: Judge Claude Sub-Agent + +## Executive Summary + +### Results Overview +- **Total Tests**: X +- **Passed**: X (X%) +- **Failed**: X (X%) +- **Errors**: X (X%) + +### Configuration Health +- **Status**: HEALTHY / NEEDS FIX +- **Error Rate**: X% +- [If >10% errors] ⚠️ High error rate indicates configuration issues - fix these before evaluating documentation quality + +### Documentation Quality (excluding errors) +- **Pass Rate**: X% (passed / (passed + failed)) +- **Recommendation**: PASS / FAIL +- **Critical Issues**: N + +## Configuration Issues (if error rate >10%) + +[Only include this section if errors >10%] + +**Error Analysis**: +- **Error Count**: X/Y tests (X%) +- **Common Error Pattern**: [describe the error type] + +**Likely Causes**: +1. [Most likely cause based on error messages] +2. [Second likely cause] + +**Recommended Fixes**: +1. [Specific fix for configuration] +2. [Specific fix for API setup] + +**Action Required**: Fix configuration issues above, then re-run evaluation to assess documentation quality. 
+ +--- + +## Test Results by Category + +[Group tests by category if identifiable from test names/descriptions] + +### Category: [Category Name] + +**Results**: X passed, Y failed, Z errors (Pass rate: X%) + +#### Passed Tests ✅ +- [Test name]: Brief note on what was validated + +#### Failed Tests ❌ +- **[Test name]** + - **Expected**: [what should have happened] + - **Actual**: [what actually happened] + - **Root Cause**: [why it failed - be specific] + - **Fix**: [what to change in documentation] + +#### Errored Tests ⚠️ +- **[Test name]**: [error type - config/provider/auth] + +[Repeat for each category] + +### Anti-Pattern Tests (Critical) + +**Status**: [ALL PASSED ✅ / FAILURES DETECTED ❌] + +[List anti-pattern test results separately - these are zero-tolerance] + +## Session Metrics + +[Only include if metrics were provided] + +| Metric | Value | Assessment | +|--------|-------|------------| +| Total Tokens | X | Normal/High/Low | +| Duration | X min | Efficient/Slow | +| Files Accessed | X | Good coverage/Missing key files | +| Entry Point | [first file accessed] | Optimal (AGENTS.md)/Sub-optimal | +| Navigation Pattern | [description] | Efficient/Scattered | + +**Key Files Accessed**: +- [List files that were read, with access counts if available] + +**Missing Files** (if any expected files weren't accessed): +- [List files that should have been accessed but weren't] + +## Documentation Quality Assessment + +[Only include this if error rate <10% - otherwise focus on fixing config first] + +### Strengths +- [What the documentation does well] +- [Evidence from passing tests] + +### Weaknesses +- [What needs improvement] +- [Evidence from failing tests] + +### Critical Gaps +- [Any anti-pattern failures or critical knowledge gaps] + +## Recommendations + +### Critical (Must Fix) +[Only include if there are critical issues] +- [Issue 1 that prevents PASS verdict] +- [Issue 2 that prevents PASS verdict] + +### High Priority +- [Important improvement 1] +- 
[Important improvement 2] + +### Medium Priority +- [Nice-to-have improvement 1] +- [Nice-to-have improvement 2] + +### Configuration Fixes (if errors >10%) +- [Config fix 1] +- [Config fix 2] + +## Conclusion + +[2-3 paragraphs summarizing the evaluation] + +**Configuration Status**: [HEALTHY / NEEDS FIX] +[If needs fix: Explain that doc quality cannot be properly assessed until config is fixed] + +**Documentation Quality**: [Only if config healthy: PASS / FAIL] +[If PASS: Summarize strengths] +[If FAIL: Summarize critical gaps] + +**Next Steps**: +1. [First action to take] +2. [Second action to take] +3. [Etc.] + +--- + +### Final Verdict + +[If config needs fix] +⚠️ **FIX CONFIGURATION FIRST**: Error rate of X% indicates configuration issues. Address these before re-evaluating documentation quality. + +[If config healthy and docs pass] +✅ **DOCUMENTATION PASSES**: Pass rate of X% meets the >90% threshold. Documentation enables effective AI agent behavior. + +[If config healthy but docs fail] +❌ **DOCUMENTATION NEEDS IMPROVEMENT**: Pass rate of X% below 90% threshold. Address the high-priority recommendations above. +``` + +## Important Guidelines + +1. **Distinguish errors from failures**: + - Errors = config/setup problems (provider, auth, promptfoo) + - Failures = documentation quality problems + - If >10% errors, focus on config fixes first + +2. **Be specific with evidence**: + - Quote actual error messages + - Reference specific test names + - Cite actual vs expected behavior + +3. **Categorize when possible**: + - Group related tests together + - Identify patterns in failures + - Separate anti-pattern tests (zero tolerance) + +4. **Actionable recommendations**: + - Each recommendation should be implementable + - Prioritize by impact + - Distinguish config fixes from doc improvements + +5. 
**Handle missing metrics gracefully**: + - If no metrics provided, skip metrics analysis + - Don't fail or complain - just focus on promptfoo results + +Generate the complete evaluation report now. +""" +) +``` + +**Variables to substitute**: +- `{promptfoo_results_json}`: Complete contents of `promptfoo-results.json` +- `{session_metrics_json}`: Complete contents of `metrics.json` +- `{{repository_name}}`: Name of repository being evaluated +- `{{evaluation_date}}`: Current date + +**Expected output from judge sub-agent**: +- Complete markdown evaluation report +- Analysis of all test results +- Distinction between config errors and doc failures +- Recommendations for improvements + +### Step 5: Display Evaluation Report + +**YOU (the main agent) display the judge's report** to the user: + +1. Show the complete markdown report from the judge +2. Highlight the final verdict (PASS/FAIL or NEEDS CONFIG FIX) +3. If there were configuration errors, emphasize that config must be fixed first +4. Point user to next steps from the recommendations + +**Example output**: +``` +Here are the evaluation results for your documentation: + +[Insert complete judge report here] + +--- + +**Summary**: [One-line summary of verdict] +**Next Step**: [Most important action from recommendations] +``` + +## Success Criteria + +**Configuration Health**: +- ✅ Healthy: <10% error rate +- ❌ Needs Fix: >10% error rate (fix config before judging docs) + +**Documentation Quality** (only if config healthy): +- ✅ PASS: >90% pass rate with zero anti-pattern failures +- ❌ FAIL: <90% pass rate or any anti-pattern failures + +**Any anti-pattern failure = FAIL** (zero tolerance) + +## Error Handling + +### Missing Config File + +If `promptfooconfig.yaml` missing: +``` +ERROR: Evaluation configuration not found + +The evaluation requires promptfooconfig.yaml to be present in the repository root. 
+ +To generate it, run: + /agentic-docs:generate-evals + +This will create a tailored evaluation suite for this repository. +``` + +**Action**: Display error and STOP. Do not run evaluation. + +### Missing API Credentials + +If both `ANTHROPIC_API_KEY` and `ANTHROPIC_VERTEX_PROJECT_ID` are missing: +``` +ERROR: No API credentials found + +Set one of: + export ANTHROPIC_API_KEY='your-key-here' + export ANTHROPIC_VERTEX_PROJECT_ID='your-project-id' +``` + +**Action**: Display error and STOP. Do not run evaluation. + +### Promptfoo Execution Errors + +If `run-eval.sh` fails: +1. Check the error message from the script +2. Common issues: + - Node.js not installed → "Install Node.js 18+" + - Script permission denied → "Run: chmod +x scripts/run-eval.sh" + - Provider config wrong → "Check promptfooconfig.yaml provider format" +3. Display the specific error to user +4. Still spawn judge sub-agent with whatever results exist (even if partial) + +### High Error Rate (>10%) + +If promptfoo results show >10% errors: +- Judge will identify this as configuration problem +- Report will focus on fixing config issues +- Documentation quality assessment will be deferred until config is fixed + +### Metrics Collection Failure + +If metrics plugin unavailable or fails: +``` +Note: Session metrics unavailable - proceeding with promptfoo results only. +``` + +**Action**: Continue with evaluation using only promptfoo results. Judge will note metrics are unavailable. 
+ +## Bundled Scripts + +This skill includes helper scripts in the `scripts/` directory: + +### run-eval.sh + +**Purpose**: Run promptfoo evaluation with proper environment setup + +**Usage**: +```bash +# Run all tests +bash "${CLAUDE_PLUGIN_ROOT}/skills/evaluate/scripts/run-eval.sh" + +# Run tests matching pattern +bash "${CLAUDE_PLUGIN_ROOT}/skills/evaluate/scripts/run-eval.sh" "navigation" +``` + +**Features**: +- Auto-detects API credentials (Vertex AI or Anthropic API) +- Handles NVM/Node.js environment setup +- Runs promptfoo with correct configuration +- Outputs results to promptfoo-results.json + +**CRITICAL**: Always use this script. Do NOT run `promptfoo eval` directly. + +### Makefile + +**Purpose**: Convenience targets for manual usage (not used by skill) + +Available targets: +- `make eval` - Run all evaluations +- `make eval-view` - Open web UI to view results + +## Complete Example Workflow + +**User request**: +``` +I just created documentation for the multiarch-tuning-operator repository at +/Users/kpais/workspace/multiarch-tuning-operator. I already ran +/agentic-docs:generate-evals. Now I want to evaluate the documentation quality. +``` + +**Your execution**: + +```python +# Step 1: Pre-flight checks +repository_path = "/Users/kpais/workspace/multiarch-tuning-operator" + +# Check if promptfooconfig.yaml exists +if not file_exists(f"{repository_path}/promptfooconfig.yaml"): + print("ERROR: promptfooconfig.yaml not found...") + return + +# Check API credentials +if not ($ANTHROPIC_API_KEY or $ANTHROPIC_VERTEX_PROJECT_ID): + print("ERROR: No API credentials...") + return + +# Step 2: Run promptfoo evaluation +bash(f"cd {repository_path} && bash ${{CLAUDE_PLUGIN_ROOT}}/skills/evaluate/scripts/run-eval.sh") + +# Step 3: Collect results and metrics +promptfoo_results = read_file(f"{repository_path}/promptfoo-results.json") +metrics = collect_session_metrics() + +# Step 4: Spawn judge sub-agent +judge_report = Agent( + description="Analyze evaluation results", + prompt=f""" + You are a judge sub-agent... 
+ + Promptfoo Results: {promptfoo_results} + Session Metrics: {metrics} + + [Full judge prompt from Step 4] + """ +) + +# Step 5: Display results +print(judge_report) +print("\n---\nNext step: [Most important recommendation]") +``` + +**Expected output**: +- Comprehensive evaluation report +- Test results: 43 tests with X passed, Y failed, Z errors +- Documentation quality verdict: PASS or FAIL +- Specific recommendations for improvements +- Session metrics analysis + +## Cost Estimate + +**Per evaluation** (simplified architecture): +- Main agent running promptfoo: ~$0.00 (bash command, no LLM calls) +- Promptfoo tests (43 tests): ~$0.15-0.30 (depends on test complexity) +- Metrics collection: ~$0.00 (local script) +- Judge sub-agent analysis: ~$0.05-0.10 (analyzes ~10-20K tokens) +- **Total**: ~$0.20-0.40 per evaluation + +**Savings vs two-agent architecture**: ~$0.02-0.05 (no code sub-agent overhead) + +## Related Commands + +- `/agentic-docs:generate-evals` - Generate promptfooconfig.yaml before evaluating +- `/agentic-docs:component` - Create component documentation to evaluate +- `/metrics:ai-docs-telemetry` - Analyze documentation usage patterns + +## Common Issues + +### Issue: All tests error (100% error rate) + +**Symptom**: Promptfoo reports 43 errors, 0 pass, 0 fail + +**Causes**: +1. **Provider configuration wrong** - Check promptfooconfig.yaml +2. **API authentication failed** - Verify API key/credentials +3. **Vertex AI project mismatch** - Check project ID + +**Fix from judge**: +Judge will identify this as >50% error rate and recommend configuration fixes before re-evaluating documentation. 
+ +**Manual check**: +```bash +# Check provider format in promptfooconfig.yaml +grep -A2 "providers:" promptfooconfig.yaml + +# Should see: +# providers: +# - anthropic:claude-sonnet-4-6 + +# NOT: +# providers: +# - id: anthropic:messages:claude-sonnet-4-6 +``` + +### Issue: Script permission denied + +**Symptom**: `bash: run-eval.sh: Permission denied` + +**Fix**: +```bash +chmod +x "${CLAUDE_PLUGIN_ROOT}/skills/evaluate/scripts/run-eval.sh" +``` + +### Issue: Node.js not found + +**Symptom**: `command not found: node` + +**Fix**: Install Node.js 18+ from nodejs.org or use nvm + +## Notes + +**CRITICAL WORKFLOW STEPS**: +1. ✅ Pre-flight checks (fail fast if missing prereqs) +2. ✅ Run run-eval.sh script directly (YOU execute it, not a sub-agent) +3. ✅ Collect session metrics (YOU collect them, not a sub-agent) +4. ✅ Spawn judge sub-agent with results + metrics +5. ✅ Display judge's report to user + +**DO NOT**: +- ❌ Spawn code sub-agent (not needed - YOU run the script) +- ❌ Run promptfoo commands directly (use bundled run-eval.sh) +- ❌ Continue if pre-flight checks fail +- ❌ Skip judge sub-agent (analysis is required) + +**Why this simplified architecture**: +- **Simpler**: One sub-agent instead of two +- **Faster**: No code sub-agent spawn overhead (~20-30 seconds saved) +- **Cheaper**: ~$0.02-0.05 savings per evaluation +- **Clearer**: Main agent runs tools, judge analyzes results +- **More reliable**: Fewer moving parts, fewer failure modes diff --git a/plugins/agentic-docs/skills/evaluate/evals/evals.json b/plugins/agentic-docs/skills/evaluate/evals/evals.json new file mode 100644 index 000000000..b7cd2fb39 --- /dev/null +++ b/plugins/agentic-docs/skills/evaluate/evals/evals.json @@ -0,0 +1,52 @@ +{ + "skill_name": "agentic-docs:evaluate", + "evals": [ + { + "id": 1, + "eval_name": "happy-path-evaluation", + "prompt": "I just created documentation for the multiarch-tuning-operator repository at /Users/kpais/kpais-workspace/claude-tmp/multiarch-tuning-operator-test-plugin. 
I ran /agentic-docs:generate-evals and it created a promptfooconfig.yaml file with 43 test cases. Now I want to run the evaluation to see if the documentation is good. Can you evaluate it?", + "expected_output": "Should run promptfoo via the bundled run-eval.sh script (main agent, no code sub-agent), collect metrics, spawn judge sub-agent to analyze results, and produce comprehensive evaluation report", + "files": [], + "setup_required": "Repository with promptfooconfig.yaml, ANTHROPIC_API_KEY set", + "assertions": [ + {"name": "detected_invalid_provider_config", "description": "v6.1 should detect the invalid Vertex AI provider format in promptfooconfig.yaml"}, + {"name": "provided_fix_instructions", "description": "Should provide clear instructions on how to fix the provider configuration"}, + {"name": "referenced_generate_evals_skill", "description": "Should reference the generate-evals skill documentation for the correct format"}, + {"name": "did_not_run_promptfoo", "description": "Should NOT run promptfoo when invalid config is detected"}, + {"name": "clear_next_steps", "description": "Should provide clear next steps (edit config or regenerate)"}, + {"name": "v60_runs_without_validation", "description": "v6.0 (baseline) should attempt to run promptfoo and encounter API errors"} + ] + }, + { + "id": 2, + "eval_name": "missing-config-error", + "prompt": "I want to evaluate the agentic documentation in /tmp/test-repo but I haven't generated the evaluation config yet. 
What should I do?", + "expected_output": "Should detect missing promptfooconfig.yaml and provide helpful error message instructing user to run /agentic-docs:generate-evals first", + "files": [], + "setup_required": "Repository without promptfooconfig.yaml", + "assertions": [ + {"name": "detected_missing_config", "description": "Should detect that promptfooconfig.yaml is missing"}, + {"name": "showed_error_message", "description": "Should display the specific error message from the skill"}, + {"name": "instructed_generate_evals", "description": "Should instruct user to run /agentic-docs:generate-evals"}, + {"name": "did_not_spawn_subagents", "description": "Should NOT spawn code/judge sub-agents when config is missing"}, + {"name": "clear_next_steps", "description": "Should provide clear next steps for user"} + ] + }, + { + "id": 3, + "eval_name": "complex-multi-step-request", + "prompt": "okay so i just finished writing all the AGENTS.md and ai-docs/ stuff for our kubernetes operator. i want to make sure AI agents can actually find and use this documentation properly. i generated some test config earlier (i think it's called promptfooconfig.yaml or something?) and now i need to run those tests and see what passes/fails. also can you tell me which docs files are getting accessed the most? 
the repository is at ~/code/my-operator", + "expected_output": "Should recognize this as evaluation request, run promptfoo tests, collect session metrics showing file access patterns, and report both test results and documentation usage analytics", + "files": [], + "setup_required": "Repository with promptfooconfig.yaml, metrics plugin available", + "assertions": [ + {"name": "recognized_evaluation_request", "description": "Should recognize this casual request as an evaluation task"}, + {"name": "handled_uncertain_config_name", "description": "Should handle user's uncertainty about config filename"}, + {"name": "ran_promptfoo_tests", "description": "Should run promptfoo tests to answer 'see what passes/fails'"}, + {"name": "collected_file_access_metrics", "description": "Should collect and report file access metrics"}, + {"name": "answered_both_questions", "description": "Should address both test results AND file access metrics in response"}, + {"name": "resolved_tilde_path", "description": "Should correctly handle ~/code/my-operator path"} + ] + } + ] +} diff --git a/plugins/agentic-docs/skills/evaluate/scripts/run-eval.sh b/plugins/agentic-docs/skills/evaluate/scripts/run-eval.sh new file mode 100644 index 000000000..365aca783 --- /dev/null +++ b/plugins/agentic-docs/skills/evaluate/scripts/run-eval.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Quick evaluation runner for OpenShift Enhancements agentic docs + +set -e + +# Get script's directory and calculate repo root +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +REPO_ROOT="$( cd "$SCRIPT_DIR/../.." && pwd )" + +echo "=== OpenShift Enhancements Agentic Docs Evaluation (Promptfoo) ===" +echo + +# Load nvm if available +export NVM_DIR="$HOME/.nvm" +if [ -s "$NVM_DIR/nvm.sh" ]; then + source "$NVM_DIR/nvm.sh" + nvm use 22 &>/dev/null || true +fi + +# Check prerequisites +if ! command -v node &> /dev/null; then + echo "❌ Error: Node.js not found. Install Node.js 18+ first." 
+ exit 1 +fi + +echo "✅ Prerequisites check passed" + +# Show which backend will be used +if [ -n "$ANTHROPIC_VERTEX_PROJECT_ID" ]; then + echo "ℹ️ Using Vertex AI (project: $ANTHROPIC_VERTEX_PROJECT_ID)" +elif [ "$CLAUDE_CODE_USE_VERTEX" = "true" ]; then + echo "ℹ️ Using Vertex AI (from ~/.config/claude/settings.json)" +elif [ -n "$ANTHROPIC_API_KEY" ]; then + echo "ℹ️ Using Anthropic API" +else + echo "⚠️ Warning: No API configuration detected" + echo " Set one of: ANTHROPIC_VERTEX_PROJECT_ID, CLAUDE_CODE_USE_VERTEX=true, or ANTHROPIC_API_KEY" +fi +echo + +# Run evaluation +echo "🚀 Running evaluations..." +echo + +# Change to repo root (where config and files are) +cd "$REPO_ROOT" + +if [ $# -eq 0 ]; then + # No arguments: run all tests + npx --yes promptfoo@latest eval -c promptfooconfig.yaml +else + # With arguments: filter tests by pattern + npx --yes promptfoo@latest eval -c promptfooconfig.yaml --filter-pattern "$1" +fi + +echo +echo "✅ Evaluation complete!" +echo +echo "💡 View detailed results:" +echo " make eval-view" \ No newline at end of file diff --git a/plugins/agentic-docs/skills/generate-evals/SKILL.md b/plugins/agentic-docs/skills/generate-evals/SKILL.md new file mode 100644 index 000000000..a4b168c5f --- /dev/null +++ b/plugins/agentic-docs/skills/generate-evals/SKILL.md @@ -0,0 +1,786 @@ +--- +name: agentic-docs:generate-evals +description: "Generate repository-specific promptfoo evaluation suites tailored to OpenShift conventions and repository patterns" +trigger: /agentic-docs:generate-evals +--- + +# Agentic-Docs: Generate Evals + +**Trigger**: `/agentic-docs:generate-evals` +**Purpose**: Generate repository-specific promptfoo evaluation configurations following OpenShift enhancement evaluation framework + +**Framework**: OpenShift Enhancements Agentic Docs Evaluation +**Reference**: https://github.com/openshift/enhancements/pull/1992 + +## Name + +agentic-docs:generate-evals - Generate repository-specific evaluation suites + +## Synopsis + 
+``` +/agentic-docs:generate-evals [<repository-path>] +``` + +## Description + +This skill generates a tailored `promptfooconfig.yaml` evaluation suite for a specific OpenShift repository **by adapting the canonical template** at `${CLAUDE_PLUGIN_ROOT}/skills/generate-evals/templates/promptfooconfig.example.yaml`. + +**Template-First Approach**: +The skill reads the reference template and preserves its structure (extensions, providers, defaultTest) while replacing only the `tests` array with repository-specific scenarios. + +**Repository Analysis**: +It analyzes the target repository's: +- **Documentation structure** (CLAUDE.md, ai-docs/, ARCHITECTURE.md) +- **Code patterns** (API versions, operator patterns, controller structure) +- **Repository conventions** (enhancement process, graduation criteria, status conditions) +- **Technology stack** (Go, Python, operators, CRDs, webhooks) + +**Generated Test Categories**: +1. **Navigation tests** - Verify agents can discover and navigate repository documentation +2. **Authoring tests** - Verify agents can design features following repository patterns +3. **Convention/anti-pattern tests** - Verify agents reject approaches that violate repository conventions + +The generated configuration follows the exact format from the template (HyperShift-based evaluation framework). + +### Why Repository-Specific Evals? + +**Generic evals cannot validate**: +- Repository-specific API conventions +- Project-specific operator patterns +- Custom enhancement processes +- Unique architectural constraints +- Technology-specific best practices + +**Repository-specific evals validate**: +- Agent discovers THIS repository's documentation +- Agent applies THIS repository's conventions correctly +- Agent avoids THIS repository's anti-patterns + +## Implementation + +### CRITICAL: Always Use the Template + +**Base template location**: +``` +${CLAUDE_PLUGIN_ROOT}/skills/generate-evals/templates/promptfooconfig.example.yaml +``` + +**MANDATORY STEPS**: + +1.
**Read the template first**: + ```bash + cat ${CLAUDE_PLUGIN_ROOT}/skills/generate-evals/templates/promptfooconfig.example.yaml + ``` + +2. **Use template as foundation** - Do NOT create promptfoo configs from scratch +3. **Preserve template structure** - Only modify the `tests` array and `description` +4. **Keep all template sections** - Extensions, providers, defaultTest unchanged + +### Template Structure + +The template demonstrates the canonical evaluation format: + +- **Extensions**: `file://hooks.js:extensionHook` for test lifecycle hooks +- **Providers**: `exec: ./run-agent.sh` for custom agent execution +- **DefaultTest**: Vertex AI provider with standard configuration +- **Tests**: LLM rubric-based assertions (no weight fields) +- **Naming**: `category/##-description` pattern +- **Variables**: Use `vars.prompt` (not `vars.task_description`) +- **Concurrency**: `evaluateOptions.maxConcurrency` setting + +**Template demonstrates**: +- Multi-agent testing patterns (agent-specific scenarios) +- Complex API design review scenarios +- Architectural anti-pattern detection +- Convention enforcement testing + +### Phase 1: Repository Analysis + +**Analyze repository structure**: + +1. **Documentation discovery**: + ```bash + # Check what documentation exists + [ -f CLAUDE.md ] + [ -d ai-docs/ ] + [ -f ARCHITECTURE.md ] + [ -f AGENTS.md ] + ``` + +2. **Code pattern analysis**: + ```bash + # Identify repository technology and patterns + find . -name "*.go" | head -5 # Go codebase? + find . -name "*_types.go" | head -5 # Kubernetes CRDs? + find . -name "operator.yaml" # Operator pattern? + grep -r "v1alpha1" --include="*.go" # API versioning? + ``` + +3. 
**Convention extraction**: + - Read CLAUDE.md for documented conventions + - Read ai-docs/ for API patterns, operator guidance + - Identify graduation requirements + - Find status condition standards + - Extract enhancement process + +### Phase 2: Navigation Test Generation + +Generate 2-3 navigation tests that verify agents can find repository-specific documentation. + +**Template structure** (following promptfooconfig.example.yaml format): +```yaml +- description: "navigation/01-" + vars: + agent: # Optional: if multi-agent setup + prompt: | + + assert: + - type: llm-rubric + value: "The output references for guidance" + - type: llm-rubric + value: "The output identifies the correct location: " + - type: llm-rubric + value: "The output demonstrates understanding of " +``` + +**Example generation logic**: + +If repository has operator patterns: +```yaml +- description: "navigation/01-operator-reconciliation-docs" + vars: + prompt: "How do I implement a new controller reconciliation loop in this codebase?" +``` + +If repository has CRD definitions: +```yaml +- description: "navigation/02-crd-development-guidelines" + vars: + prompt: "Where can I find guidance on defining new Custom Resource Definitions?" +``` + +### Phase 3: Authoring Test Generation + +Generate 1-2 enhancement authoring tests using repository-specific scenarios. + +**Generation approach**: + +1. **Identify repository domain**: + - Networking? Storage? Security? Monitoring? + - Example: If repo = cluster-network-operator → networking domain + +2. **Generate fictional enhancement**: + - Must fit repository domain + - Should exercise documented patterns + - Should require following conventions + +**Template structure** (following promptfooconfig.example.yaml format): +```yaml +- description: "authoring/01--design" + vars: + agent: # Optional: if multi-agent setup + prompt: | + Design a new enhancement for "" that .
+ Include API design, architecture, and graduation criteria + following repository conventions. + assert: + - type: llm-rubric + value: "The API design starts with v1alpha1, not v1" + - type: llm-rubric + value: "The output includes " + - type: llm-rubric + value: "The design follows " +``` + +**Example for operator repository** (following template format): +```yaml +- description: "authoring/01-network-policy-automation" + vars: + prompt: | + Design a new enhancement for "NetworkPolicyAutomation" that automatically + generates network policies based on service discovery. Include API design, + operator architecture, and graduation criteria following repository conventions. + assert: + - type: llm-rubric + value: "The design follows standard Kubernetes operator patterns with controllers and reconciliation loops" + - type: llm-rubric + value: "The API design starts with v1alpha1, not v1" +``` + +### Phase 4: Anti-Pattern Test Generation + +Generate 3-5 anti-pattern tests based on repository-specific conventions. + +**Standard anti-patterns** (following promptfooconfig.example.yaml format): + +1. **API versioning**: + ```yaml + - description: "conventions/01-api-versioning" + vars: + prompt: | + Review: "We should create a new starting at v1." + Is this correct? + assert: + - type: llm-rubric + value: "The output rejects starting new APIs at v1 and recommends v1alpha1" + ``` + +2. **Status conditions** (for operator repos): + ```yaml + - description: "conventions/02-status-conditions" + vars: + prompt: | + Review: "Our operator will use custom conditions: Ready, Healthy, Operating." + Is this correct? + assert: + - type: llm-rubric + value: "The output rejects custom status conditions and references standard ClusterOperator conditions" + ``` + +3. **Breaking changes**: + ```yaml + - description: "conventions/03-breaking-changes" + vars: + prompt: | + Review: "Let's rename field 'oldName' to 'newName' in v1beta1." + Is this correct? 
+ assert: + - type: llm-rubric + value: "The output rejects breaking changes without deprecation period" + ``` + +**Repository-specific anti-patterns**: + +Extract from CLAUDE.md or ai-docs/ sections that say: +- "Never..." +- "Do not..." +- "Avoid..." +- "Must not..." + +**Example extraction** (following template format): +``` +CLAUDE.md contains: "Never expose secrets in CRD spec fields" +↓ +Generate anti-pattern test: +- description: "conventions/04-secret-exposure" + vars: + prompt: | + Review: "We should add a 'password' field to the CRD spec for convenience." + Is this correct? + assert: + - type: llm-rubric + value: "The output rejects exposing secrets in CRD spec and suggests SecretReference pattern" +``` + +### Phase 5: Generate promptfooconfig.yaml + +**CRITICAL**: Generate the configuration by reading and adapting the template: + +```bash +# Read the canonical template +cat ${CLAUDE_PLUGIN_ROOT}/skills/generate-evals/templates/promptfooconfig.example.yaml +``` + +**Template adaptation steps**: + +1. **Copy template structure**: + - Preserve `extensions` and `providers` sections exactly + - Keep `defaultTest` configuration + - Maintain `evaluateOptions.maxConcurrency` setting + +2. **Update description**: + ```yaml + description: "<repository-name> - Agentic Documentation Evaluation" + ``` + +3. **Adapt prompts section**: + ```yaml + prompts: + - "{{prompt}}" # Use template variable for dynamic test prompts + ``` + +4. **Replace tests array** with generated test cases: + ```yaml + tests: + # Navigation tests (2-3 generated from Phase 2) + - description: "navigation/##-<description>" + vars: + agent: # If multi-agent setup + prompt: | + + assert: + - type: llm-rubric + value: "" + + # Authoring tests (1-2 generated from Phase 3) + - description: "authoring/##-<description>" + vars: + prompt: | + + assert: + - type: llm-rubric + value: "" + + # Anti-pattern tests (3-5 generated from Phase 4) + - description: "conventions/##-<description>" + vars: + prompt: | + + assert: + - type: llm-rubric + value: "" + ``` + +5.
**Preserve provider configuration**: + - Keep `exec: ./run-agent.sh` if target repo has agent setup + - Keep Vertex AI `defaultTest` provider (standard for OpenShift repos) + - Do NOT modify provider format - use template exactly + +**Key template elements to preserve**: + +```yaml +extensions: + - file://hooks.js:extensionHook # Keep if hooks.js exists in target repo + +providers: + - id: "exec: ./run-agent.sh" # Keep if run-agent.sh exists + label: claude + +defaultTest: + options: + provider: + id: vertex:claude-opus-4-6 # Use template provider config + config: + projectId: "{{ env.ANTHROPIC_VERTEX_PROJECT_ID }}" + region: global + temperature: 0 + +evaluateOptions: + maxConcurrency: 6 # Adjust based on test count +``` + +**Write to**: `<repository-path>/promptfooconfig.yaml` + +**Supporting files** (if not present, create them): +- `run-agent.sh` - Agent execution wrapper (copy from template repo if needed) +- `hooks.js` - Pre/post test hooks (copy from template repo if needed) + +### Template Adaptation Example + +**From template** (preserve these sections): +```yaml +description: "HyperShift agent and convention evals" + +extensions: + - file://hooks.js:extensionHook + +providers: + - id: "exec: ./run-agent.sh" + label: claude + +prompts: + - "{{prompt}}" + +defaultTest: + options: + provider: + id: vertex:claude-opus-4-6 + config: + projectId: "{{ env.ANTHROPIC_VERTEX_PROJECT_ID }}" + region: global + temperature: 0 + +tests: + # ...
(template tests here) + +evaluateOptions: + maxConcurrency: 6 +``` + +**To repository-specific** (only change description and tests): +```yaml +description: "cluster-network-operator - Agentic Documentation Evaluation" # ✓ Changed + +extensions: + - file://hooks.js:extensionHook # ✓ Preserved + +providers: + - id: "exec: ./run-agent.sh" # ✓ Preserved + label: claude + +prompts: + - "{{prompt}}" # ✓ Preserved + +defaultTest: + options: + provider: + id: vertex:claude-opus-4-6 # ✓ Preserved + config: + projectId: "{{ env.ANTHROPIC_VERTEX_PROJECT_ID }}" + region: global + temperature: 0 + +tests: + # ✓ REPLACED with repository-specific tests + - description: "navigation/01-operator-pattern-discovery" + vars: + prompt: | + How do I implement a new controller reconciliation loop in this codebase? + assert: + - type: llm-rubric + value: "The output references ai-docs/OPERATORS.md for guidance" + + - description: "conventions/01-api-versioning" + vars: + prompt: | + Review: "We should create a new NetworkPolicy API starting at v1." + Is this correct? 
+ assert: + - type: llm-rubric + value: "The output rejects starting new APIs at v1 and recommends v1alpha1" + +evaluateOptions: + maxConcurrency: 2 # ✓ Adjusted for test count +``` + +### Common Template Mistakes to Avoid + +❌ **DO NOT create promptfoo configs from scratch** - Always start with the template + +❌ **DO NOT modify provider configuration** - Use template's Vertex AI setup exactly: +```yaml +# Keep this from template: +defaultTest: + options: + provider: + id: vertex:claude-opus-4-6 + config: + projectId: "{{ env.ANTHROPIC_VERTEX_PROJECT_ID }}" + region: global + temperature: 0 +``` + +❌ **DO NOT add weight fields** to assertions - Template doesn't use them: +```yaml +# WRONG: +assert: + - type: llm-rubric + value: "Criteria" + weight: 3.0 # ❌ Don't add weights + +# CORRECT: +assert: + - type: llm-rubric + value: "Criteria" # ✅ No weight field +``` + +❌ **DO NOT use old assertion types** - Template uses `llm-rubric` primarily: +```yaml +# WRONG: +assert: + - type: icontains # ❌ Don't use string matching + value: "CLAUDE.md" + +# CORRECT: +assert: + - type: llm-rubric # ✅ Use LLM-based evaluation + value: "The output references CLAUDE.md for guidance" +``` + +❌ **DO NOT use `vars.task_description`** - Template uses `vars.prompt`: +```yaml +# WRONG: +vars: + task_description: "Question" # ❌ Old pattern + +# CORRECT: +vars: + prompt: "Question" # ✅ Template pattern +``` + +✅ **DO preserve these template sections exactly**: +- `extensions` - Test lifecycle hooks +- `providers` - Agent execution configuration +- `defaultTest` - Vertex AI provider config +- `evaluateOptions` - Concurrency settings (only `maxConcurrency` may be adjusted to the test count) + +### Phase 6: Generate Evaluation Documentation + +Create `<repository-path>/EVALUATION.md`: + +```markdown +# Evaluation Suite + +This repository uses promptfoo-based evaluation to validate agentic documentation quality.
+ +## Generated Evaluation Scenarios + +### Navigation Tests () +- +- + +### Authoring Tests () +- + +### Anti-Pattern Tests () +- +- +- + +## Running Evaluations + +```bash +# All tests +make eval + +# By category +make eval-navigation +make eval-authoring +make eval-anti-pattern + +# View results +make eval-view +``` + +## Customizing + +To add repository-specific evaluation scenarios: + +1. Edit `promptfooconfig.yaml` +2. Add new test under appropriate category +3. Follow the existing format and assertion structure +4. Run `make eval` to validate + +## Regenerating + +To regenerate evaluation suite after documentation changes: + +```bash +/agentic-docs:generate-evals +``` + +This will analyze current repository state and update evaluation scenarios. +``` + +## Return Value + +**Success**: +``` +✅ Generated repository-specific evaluation suite + +Navigation tests: +Authoring tests: +Anti-pattern tests: + +Generated files: + • promptfooconfig.yaml - Evaluation configuration + • EVALUATION.md - Evaluation documentation + +Run evaluations: make eval +View results: make eval-view +``` + +**Failure**: +``` +❌ Evaluation generation failed + +Reason: + +Ensure: + • Target repository has CLAUDE.md or ai-docs/ + • Target repository contains code to analyze + • Running from target repository directory +``` + +## Examples + +### Example 1: Generate evals for operator repository + +**Input**: +``` +/agentic-docs:generate-evals /path/to/cluster-network-operator +``` + +**Analysis phase**: +``` +Analyzing repository structure... 
+✓ Found CLAUDE.md +✓ Found ai-docs/ directory (12 files) +✓ Found Go codebase (245 .go files) +✓ Identified operator pattern (CRDs, controllers) +✓ Found API versioning (v1alpha1, v1) +✓ Extracted conventions from ai-docs/OPERATORS.md +``` + +**Generation phase**: +``` +Reading template: ${CLAUDE_PLUGIN_ROOT}/skills/generate-evals/templates/promptfooconfig.example.yaml +✓ Template loaded (160 lines) +✓ Preserving extensions, providers, defaultTest sections + +Generating navigation tests... + ✓ navigation/01-operator-pattern-discovery + ✓ navigation/02-controller-reconciliation-guidance + +Generating authoring tests... + ✓ authoring/01-network-policy-automation + +Generating anti-pattern tests... + ✓ conventions/01-api-versioning + ✓ conventions/02-status-conditions + ✓ conventions/03-breaking-changes + ✓ conventions/04-sync-network-calls + ✓ conventions/05-secret-exposure + +Adapting template structure... +✓ Updated description field +✓ Replaced tests array (8 tests) +✓ Preserved provider configuration +✓ Set maxConcurrency: 8 +``` + +**Output**: +``` +✅ Generated evaluation suite + +8 scenarios created: + • 2 navigation tests + • 1 authoring test + • 5 anti-pattern tests (3 standard + 2 repository-specific) + +Files created: + • promptfooconfig.yaml (generated from repository analysis) + • EVALUATION.md (evaluation documentation) + +Run: make eval +``` + +### Example 2: Automatically invoked after documentation creation + +**User runs**: +``` +/agentic-docs:create /path/to/my-operator +``` + +**Skill execution**: +``` +[agentic-docs:create running...] +✓ Documentation generated + +[Auto-invoking agentic-docs:generate-evals...] + +Generating repository-specific evaluation suite...
+✓ Navigation tests: 2 +✓ Authoring tests: 1 +✓ Anti-pattern tests: 4 + +Evaluation suite ready: make eval +``` + +## Arguments + +### `<repository-path>` + +**Optional** - Path to target repository being documented + +**Default**: Current directory (`.`) + +**Examples**: +```bash +# Current directory +/agentic-docs:generate-evals + +# Specific repository +/agentic-docs:generate-evals /path/to/repo + +# After documentation creation (automatic) +# No arguments needed - uses same path as create command +``` + +## Integration with agentic-docs:create + +This skill is **automatically invoked** at the end of `/agentic-docs:create`: + +``` +/agentic-docs:create → [generates documentation] → /agentic-docs:generate-evals +``` + +**Auto-invocation behavior**: +- Uses same repository path as create command +- Runs after all documentation is generated +- Analyzes newly created ai-docs/ content +- Generates evaluation suite based on documentation +- No user interaction required + +**Disabling auto-invocation**: +``` +/agentic-docs:create --skip-evals +``` + +## Quality Criteria + +Generated evaluation suites must: + +1. **Follow template structure exactly**: + - Read `${CLAUDE_PLUGIN_ROOT}/skills/generate-evals/templates/promptfooconfig.example.yaml` + - Preserve `extensions`, `providers`, and `defaultTest` sections + - Use `llm-rubric` assertions (primary assertion type in template) + - Follow test naming convention: `category/##-description` + - Use `vars.prompt` for test input (not `vars.task_description`) + - Do NOT add `weight` fields to assertions (not used in template) + +2. **Be repository-specific**: + - Reference actual documentation paths in rubric criteria + - Test actual repository patterns + - Use domain-appropriate examples + - Extract real conventions from docs + +3. **Cover all categories**: + - Minimum 2 navigation tests + - Minimum 1 authoring test + - Minimum 3 convention/anti-pattern tests (standard set) + - Additional repository-specific anti-patterns + +4.
**Be executable**: + - promptfooconfig.yaml runs without errors + - All `llm-rubric` assertions have clear success criteria + - Prompts are unambiguous + - Expected outcomes are achievable + +5. **Match template format**: + - Use Vertex AI provider configuration from template + - Include `evaluateOptions.maxConcurrency` setting + - Preserve `temperature: 0` for deterministic evaluation + - Use `file://` references for external files if needed + +## Limitations + +**Cannot generate evaluations for**: +- Repositories without any documentation +- Repositories without clear conventions +- Non-OpenShift repositories (patterns may not apply) + +**Requires**: +- CLAUDE.md or ai-docs/ exists +- Repository follows OpenShift/Kubernetes patterns +- Code is analyzable (Go, Python, YAML) + +## Version History + +**v2.0** (2026-05-15): +- **Template-first approach**: Always use `templates/promptfooconfig.example.yaml` as base +- Use `llm-rubric` assertions (template pattern) +- Use `vars.prompt` instead of `vars.task_description` +- Preserve template's extensions, providers, defaultTest sections +- Follow `category/##-description` naming convention +- Remove weight fields from assertions (not in template) +- Document common template mistakes to avoid + +**v1.0** (2026-05-14): +- Initial repository-specific evaluation generation +- Auto-invocation after agentic-docs:create +- Three test categories (navigation, authoring, anti-pattern) +- Standard + repository-specific anti-patterns +- promptfooconfig.yaml generation +- EVALUATION.md documentation generation diff --git a/plugins/agentic-docs/skills/generate-evals/templates/promptfooconfig.example.yaml b/plugins/agentic-docs/skills/generate-evals/templates/promptfooconfig.example.yaml new file mode 100644 index 000000000..2f91ab228 --- /dev/null +++ b/plugins/agentic-docs/skills/generate-evals/templates/promptfooconfig.example.yaml @@ -0,0 +1,160 @@ +description: "HyperShift agent and convention evals" + +extensions: + - 
file://hooks.js:extensionHook + +providers: + - id: "exec: ./run-agent.sh" + label: claude + +prompts: + - "{{prompt}}" + +defaultTest: + options: + provider: + id: vertex:claude-opus-4-6 + config: + projectId: "{{ env.ANTHROPIC_VERTEX_PROJECT_ID }}" + region: global + temperature: 0 + +tests: + # --- api-sme --- + - description: "api-sme/01-api-design-review" + vars: + agent: api-sme + tools: "Bash,Read,Grep,Glob" + patchFile: "../eval/testdata/sme-agents/api-sme/01-api-design-review/patch.diff" + prompt: | + I've added new foo fields to HostedClusterSpec in + api/hypershift/v1beta1/hostedcluster_types.go. The code is already + on disk. Please review the change. + assert: + - type: llm-rubric + value: "The output identifies that Foo_IP should use Go PascalCase naming (no underscores)" + - type: llm-rubric + value: "The output identifies that JSON tags must use lowerCamelCase (not snake_case or PascalCase)" + - type: llm-rubric + value: "The output identifies missing omitempty or omitzero on every field" + - type: llm-rubric + value: "The output identifies missing IP address format validation (CEL or kubebuilder)" + - type: llm-rubric + value: "The output identifies that FooConfig should not be a pointer — use value type with omitzero instead" + - type: llm-rubric + value: "The output identifies missing +listType marker on slice field for server-side apply" + - type: llm-rubric + value: "The output identifies that FooID immutability rule is incomplete — self == oldSelf either blocks initial set or allows remove-then-set bypass on optional fields" + - type: llm-rubric + value: "The output identifies missing +optional or +required markers on fields" + - type: llm-rubric + value: "The output identifies that fields sharing a common prefix should be consolidated into a single struct rather than scattered on the parent spec" + + # --- cloud-provider-sme --- + - description: "cloud-provider-sme/01-kms-integration" + vars: + agent: cloud-provider-sme + prompt: | + We want 
to implement customer-managed encryption key support for + etcd data at rest in hosted control planes. The feature should work + across AWS and Azure. How should we design this in HyperShift? + What API changes and controller logic are needed? + assert: + - type: llm-rubric + value: "The output mentions platform-specific KMS services (AWS KMS and Azure Key Vault)" + - type: llm-rubric + value: "The output proposes an API-level abstraction for cross-platform KMS configuration" + - type: llm-rubric + value: "The output addresses IAM or credential requirements for KMS access" + - type: llm-rubric + value: "The output references Kubernetes EncryptionConfiguration or etcd encryption provider mechanism" + + # --- control-plane-sme --- + - description: "control-plane-sme/01-ho-cpo-version-skew" + vars: + agent: control-plane-sme + prompt: | + We want to add a new control plane component called "policy-engine" + that enforces admission policies on the hosted cluster. The + component needs to behave differently depending on the OCP version + of the hosted control plane — in 4.18+ it should use + ValidatingAdmissionPolicy (native K8s), but in 4.17 and below it + should fall back to a webhook-based approach. + + The HyperShift Operator needs to know which variant to configure + when reconciling the HostedCluster, and the CPO needs to deploy + the right version of the component. + + How should we implement this considering HyperShift's versioning + model and the HO/CPO version skew constraints? 
+ assert: + - type: llm-rubric + value: "The output references the cpov2 or controlplane-component framework for deploying the component" + - type: llm-rubric + value: "The output states that version-dependent behavior should be decided in the CPO based on the hosted cluster release version, not in the HO" + - type: llm-rubric + value: "The output explains that HO and CPO can run different versions and the HO must not assume which CPO version is running" + - type: llm-rubric + value: "The output states that the CPO image is part of the OCP release payload and matches the hosted cluster version" + - type: llm-rubric + value: "The output considers impact on control plane resource footprint (CPU, memory)" + + # --- data-plane-sme --- + - description: "data-plane-sme/01-spot-instance-lifecycle" + vars: + agent: data-plane-sme + prompt: | + We want to improve spot/preemptible instance support in NodePools. + Currently users can request spot instances on AWS, but we want to + ensure consistent behavior across platforms. How should the NodePool + API and controllers handle instance interruption events, and what + changes are needed for the data plane upgrade flow to account for + spot instance characteristics? 
+ assert: + - type: llm-rubric + value: "The output discusses NodePool API abstraction for spot across platforms (AWS Spot, Azure Spot VMs, GCP Preemptible/Spot)" + - type: llm-rubric + value: "The output addresses instance interruption lifecycle (node drain, workload rescheduling, machine replacement)" + - type: llm-rubric + value: "The output considers impact of spot instances on rolling upgrade strategy" + - type: llm-rubric + value: "The output references ClusterAPI (CAPI) resources or controllers (MachineSet, MachineDeployment, Machine)" + + # --- hcp-architect-sme --- + - description: "hcp-architect-sme/01-architectural-review" + vars: + agent: hcp-architect-sme + prompt: | + We are considering a design where the hosted cluster's worker + nodes send status updates directly to the hypershift-operator in + the management cluster via a webhook. The worker node would call + a REST endpoint on the hypershift-operator to report node health + metrics. This way we get real-time health data without polling. + + What do you think of this approach? + assert: + - type: llm-rubric + value: "The output flags violation of unidirectional communication principle (management to hosted, never reverse)" + - type: llm-rubric + value: "The output raises security or tenant isolation concerns" + - type: llm-rubric + value: "The output suggests an alternative architecture that respects unidirectional communication" + + # --- conventions --- + - description: "conventions/01-go-test-style" + vars: + prompt: | + Write a unit test for a function called ParseMaintenanceWindow that + takes a cron string and duration in minutes, and returns a + MaintenanceWindow struct or an error. It should reject empty cron + strings, durations less than 30 minutes, and durations greater than + 480 minutes. It should accept valid inputs like "0 2 * * 6" with + duration 120. Just write the test, not the function itself. 
+ assert: + - type: llm-rubric + value: "The generated test code uses Gherkin syntax with 'When... it should...' pattern in test names" + - type: llm-rubric + value: "The generated test code uses gomega matchers for assertions (Expect, BeTrue, BeFalse, HaveOccurred, etc.) rather than standard testing package assertions" + +evaluateOptions: + maxConcurrency: 6 \ No newline at end of file diff --git a/plugins/metrics/README.md b/plugins/metrics/README.md index 9d10f5ea1..1a99d5333 100644 --- a/plugins/metrics/README.md +++ b/plugins/metrics/README.md @@ -7,9 +7,31 @@ Anonymous usage metrics collection for ai-helpers slash commands, skills, and se The `metrics` plugin provides anonymous usage tracking for: - **Events**: Individual slash commands and skill invocations - **Sessions**: Aggregate session-level metrics (duration, tool usage, conversation patterns) +- **AI Docs Usage**: Track how agentic documentation is used during development This helps maintainers understand usage patterns and make data-driven decisions about feature development and improvements. +## Commands + +### `/metrics:ai-docs-telemetry` + +Analyze Claude Code session logs to track ai-docs usage patterns. See [ai-docs-telemetry.md](commands/ai-docs-telemetry.md) for full documentation. 
+ +**Quick examples:** +```bash +# Scan all recent sessions +/metrics:ai-docs-telemetry -scan + +# Scan only enhancements repo +/metrics:ai-docs-telemetry -scan -project enhancements + +# Analyze specific session +/metrics:ai-docs-telemetry -session ~/.claude/projects//.jsonl + +# Pipe to jq for analysis +/metrics:ai-docs-telemetry -scan | jq -r '.[] | "\(.documentation.entry_point): \(.documentation.total_files)"' +``` + ## How It Works The plugin uses Claude Code's [hook system](https://docs.claude.com/en/docs/claude-code/hooks) to automatically track usage: @@ -305,7 +327,9 @@ All metrics collection logic is open source and available in this repository: - **Hook definition**: `plugins/metrics/hooks/hooks.json` - **Event collection script**: `plugins/metrics/scripts/send_metrics.py` - **Session collection script**: `plugins/metrics/scripts/send_session_metrics.py` +- **AI docs telemetry script**: `plugins/metrics/scripts/ai_docs_telemetry.py` - **Plugin metadata**: `plugins/metrics/.claude-plugin/plugin.json` +- **Commands**: `plugins/metrics/commands/` ## Data Usage diff --git a/plugins/metrics/commands/ai-docs-telemetry.md b/plugins/metrics/commands/ai-docs-telemetry.md new file mode 100644 index 000000000..01d47d443 --- /dev/null +++ b/plugins/metrics/commands/ai-docs-telemetry.md @@ -0,0 +1,97 @@ +--- +description: Analyze Claude Code session logs for ai-docs usage patterns +argument-hint: "[-scan] [-project ] [-session ]" +--- + +## Name +metrics:ai-docs-telemetry + +## Synopsis +``` +/metrics:ai-docs-telemetry -scan [-project ] +/metrics:ai-docs-telemetry -session +``` + +## Description +The `metrics:ai-docs-telemetry` command analyzes Claude Code session logs to track how agentic documentation (ai-docs) is used during development. It parses session JSONL files to extract Read tool calls to ai-docs files and generates telemetry events. 
+ +This helps measure: +- Documentation effectiveness and usage patterns +- Which files are accessed most frequently +- Entry points for documentation discovery (AGENTS.md, direct search, etc.) +- Navigation paths through documentation + +All output is JSON to stdout, making it easy to pipe to `jq` for analysis. + +## Implementation +```python +${CLAUDE_PLUGIN_ROOT}/scripts/ai_docs_telemetry.py "$@" +``` + +The script: +- Parses `~/.claude/projects/` JSONL files +- Detects Read tool calls to files matching `ai-docs/`, `AGENTS.md`, or `CLAUDE.md` +- Tracks access sequence and timestamps +- Identifies entry points (AGENTS.md vs direct search) +- Privacy-first: Only file paths tracked, no code/prompts/user data + +## Return Value +- **JSON**: Single event or array of events +- **Summary**: Printed to stderr with session counts + +## Examples + +1. **Scan all recent sessions (last 7 days)**: + ``` + /metrics:ai-docs-telemetry -scan + ``` + Output: + ```json + [ + { + "event_type": "ai_docs_usage", + "session_id": "a0350e3f-1853-4a56-be01-865cd0df1944", + "documentation": { + "entry_point": "AGENTS.md", + "files_accessed": [...], + "total_files": 5 + } + } + ] + ``` + +2. **Scan only enhancements repository**: + ``` + /metrics:ai-docs-telemetry -scan -project enhancements + ``` + +3. **Scan only machine-config-operator repository**: + ``` + /metrics:ai-docs-telemetry -scan -project machine-config-operator + ``` + +4. **Analyze a specific session**: + ``` + /metrics:ai-docs-telemetry -session ~/.claude/projects//.jsonl + ``` + +5. 
**Pipe to jq for analysis**: + ```bash + # Count files by entry point + /metrics:ai-docs-telemetry -scan | jq -r '.[] | "\(.documentation.entry_point): \(.documentation.total_files)"' + + # List most accessed files + /metrics:ai-docs-telemetry -scan | jq -r '.[] | .documentation.files_accessed[].path' | sort | uniq -c | sort -rn + + # Filter sessions with >5 files accessed + /metrics:ai-docs-telemetry -scan | jq '.[] | select(.documentation.total_files > 5)' + ``` + +## Arguments +- `-scan`: Scan all recent Claude Code sessions (last 7 days) +- `-project `: Filter sessions by project name (e.g., "enhancements", "machine-config-operator") +- `-session `: Analyze a specific session JSONL file + +## Related +- Session hooks: `metrics` plugin's `SessionEnd` hook +- General metrics: `send_session_metrics.py` diff --git a/plugins/metrics/scripts/ai_docs_telemetry.py b/plugins/metrics/scripts/ai_docs_telemetry.py new file mode 100755 index 000000000..cbcd36ca9 --- /dev/null +++ b/plugins/metrics/scripts/ai_docs_telemetry.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python3 +""" +AI Docs Telemetry Analysis Script + +Analyzes Claude Code session logs to track ai-docs usage patterns. +Parses session JSONL files to extract Read tool calls to ai-docs files. 
+ +Usage: + ai_docs_telemetry.py -scan [-project ] + ai_docs_telemetry.py -session +""" + +import sys +import json +import os +import pathlib +import datetime +import argparse +from typing import Optional, List, Dict, Any +from dataclasses import dataclass, asdict + + +@dataclass +class FileAccess: + """Represents a single file access in the session.""" + path: str + sequence: int + time: str + + +@dataclass +class PlatformInfo: + """Platform information.""" + name: str = "claude-code" + version: str = "unknown" + + +@dataclass +class RepositoryInfo: + """Repository information extracted from session path.""" + name: str + path: str + + +@dataclass +class DocumentationInfo: + """Documentation usage information.""" + entry_point: str + files_accessed: List[Dict[str, Any]] + total_files: int + + +@dataclass +class TelemetryEvent: + """Complete telemetry event.""" + event_type: str + version: str + timestamp: str + session_id: str + platform: Dict[str, str] + repository: Dict[str, str] + documentation: Dict[str, Any] + + +def extract_repo_info(session_path: str) -> RepositoryInfo: + """ + Extract repository information from session path. 
+ Path format: ~/.claude/projects//.jsonl + """ + parts = session_path.split("/projects/") + if len(parts) < 2: + return RepositoryInfo(name="unknown", path="unknown") + + # Get the project directory name + project_dir = parts[1].split("/")[0] + + # Decode project name (simplified - just replace dashes with slashes) + repo_name = project_dir.replace("-", "/") + + return RepositoryInfo(name=repo_name, path=project_dir) + + +def detect_entry_point(files: List[FileAccess]) -> str: + """Determine how user discovered ai-docs.""" + if not files: + return "unknown" + + first = files[0].path + if first.endswith("AGENTS.md") or first.endswith("CLAUDE.md"): + return "AGENTS.md" + if first.endswith("README.md"): + return "README.md" + + return "direct-search" + + +def process_session(session_path: str) -> Optional[TelemetryEvent]: + """ + Analyze a Claude Code session log and extract ai-docs usage. + Returns None if no ai-docs usage detected. + """ + try: + with open(session_path, 'r') as f: + content = f.read() + except Exception as e: + print(f"Error reading session: {e}", file=sys.stderr) + return None + + lines = content.split('\n') + ai_docs_files: List[FileAccess] = [] + session_id = pathlib.Path(session_path).stem + + for line in lines: + if not line.strip(): + continue + + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + + # Look for Read tool calls to ai-docs files + if event.get("type") != "assistant": + continue + + msg = event.get("message", {}) + content_arr = msg.get("content", []) + + for item in content_arr: + if not isinstance(item, dict): + continue + + if item.get("type") == "tool_use" and item.get("name") == "Read": + input_data = item.get("input", {}) + file_path = input_data.get("file_path", "") + + # Check if it's an ai-docs file or AGENTS.md + if ("ai-docs/" in file_path or + file_path.endswith("AGENTS.md") or + file_path.endswith("CLAUDE.md")): + + timestamp = event.get("timestamp", datetime.datetime.now().isoformat()) + + 
ai_docs_files.append(FileAccess( + path=file_path, + sequence=len(ai_docs_files) + 1, + time=timestamp + )) + + if not ai_docs_files: + return None + + # Extract repository info + repo_info = extract_repo_info(session_path) + + # Build telemetry event + event = TelemetryEvent( + event_type="ai_docs_usage", + version="1.0", + timestamp=datetime.datetime.now().isoformat(), + session_id=session_id, + platform=asdict(PlatformInfo()), + repository=asdict(repo_info), + documentation={ + "entry_point": detect_entry_point(ai_docs_files), + "files_accessed": [asdict(f) for f in ai_docs_files], + "total_files": len(ai_docs_files) + } + ) + + return event + + +def scan_recent_sessions(project_filter: Optional[str] = None) -> List[TelemetryEvent]: + """ + Scan ~/.claude/projects/ for recent sessions with ai-docs usage. + Returns list of telemetry events. + """ + home_dir = pathlib.Path.home() + projects_dir = home_dir / ".claude" / "projects" + + if not projects_dir.exists(): + print(f"Projects directory not found: {projects_dir}", file=sys.stderr) + return [] + + events = [] + processed_count = 0 + seven_days_ago = datetime.datetime.now() - datetime.timedelta(days=7) + + # Walk through all project directories + for session_file in projects_dir.glob("**/*.jsonl"): + # Skip files older than 7 days + mtime = datetime.datetime.fromtimestamp(session_file.stat().st_mtime) + if mtime < seven_days_ago: + continue + + # Filter by project if specified + if project_filter and project_filter not in str(session_file): + continue + + processed_count += 1 + + # Quick pre-filter: check if file contains ai-docs markers + try: + content = session_file.read_text() + if not ("ai-docs/" in content or "AGENTS.md" in content): + continue + except Exception: + continue + + # Process session + event = process_session(str(session_file)) + if event: + events.append(event) + + print(f"\n📊 Summary: {processed_count} sessions scanned, {len(events)} with ai-docs usage", + file=sys.stderr) + + return events 


def main():
    """Parse command-line flags and print telemetry as JSON on stdout.

    Modes:
      -scan [-project NAME]  print a JSON array of events for recent sessions
      -session PATH          print a single JSON event for one session file

    With -scan the output is always a JSON array; it is ``[]`` when nothing
    matched, so that consumers reading stdout as JSON never see empty input.
    With -session nothing is printed when the session has no ai-docs usage.
    Without either mode the help text is shown and the exit status is 1.
    """
    parser = argparse.ArgumentParser(
        description="Analyze Claude Code session logs for ai-docs usage"
    )
    parser.add_argument("-scan", action="store_true",
                        help="Scan all recent Claude Code sessions (last 7 days)")
    parser.add_argument("-project", type=str,
                        help="Filter by project name (e.g., 'enhancements', 'machine-config-operator')")
    parser.add_argument("-session", type=str,
                        help="Analyze a specific session JSONL file")

    args = parser.parse_args()

    # Tell the user about flags that have no effect in the chosen mode
    # instead of dropping them silently. Diagnostics go to stderr so stdout
    # stays pure JSON.
    if args.scan and args.session:
        print("Note: -session is ignored when -scan is given", file=sys.stderr)
    if args.project and not args.scan:
        print("Note: -project only applies together with -scan", file=sys.stderr)

    if args.scan:
        events = scan_recent_sessions(args.project)
        # Output as JSON array (an empty scan yields "[]", not empty output)
        print(json.dumps([asdict(e) for e in events], indent=2))
    elif args.session:
        event = process_session(args.session)
        if event:
            print(json.dumps(asdict(event), indent=2))
    else:
        parser.print_help()
        sys.exit(1)


if __name__ == "__main__":
    main()