From 740067084a67bc459bd4fd8e5516ee64ab829a12 Mon Sep 17 00:00:00 2001 From: Stephen Benjamin Date: Tue, 5 May 2026 14:45:37 -0400 Subject: [PATCH 1/7] chore: add .pruneprotect and bigquery OWNERS Add pruneprotect file for automated pruning protection, and OWNERS file for the bigquery plugin. Co-Authored-By: Claude Opus 4.6 --- .pruneprotect | 18 ++++++++++++++++++ plugins/bigquery/OWNERS | 16 ++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 .pruneprotect create mode 100644 plugins/bigquery/OWNERS diff --git a/.pruneprotect b/.pruneprotect new file mode 100644 index 000000000..88f029bed --- /dev/null +++ b/.pruneprotect @@ -0,0 +1,18 @@ +# Plugins protected from automated pruning +# One path per line. Lines starting with # are comments. + +# Canonical example plugin +plugins/hello-world/ + +# Infrastructure plugins (hook-only, by design) +plugins/metrics/ +plugins/native-notifications/ + +# Marketplace operations plugin +plugins/marketplace-ops/ + +# Saved by @stbenjam on 2026-05-05 +plugins/bigquery/ +plugins/ci/ +plugins/hcp/ +plugins/sosreport/ diff --git a/plugins/bigquery/OWNERS b/plugins/bigquery/OWNERS new file mode 100644 index 000000000..326d1cda8 --- /dev/null +++ b/plugins/bigquery/OWNERS @@ -0,0 +1,16 @@ +approvers: + - dgoodwin + - neisw + - petr-muller + - stbenjam + - xueqzhan + - sosiouxme + - smg247 +reviewers: + - dgoodwin + - neisw + - petr-muller + - stbenjam + - xueqzhan + - sosiouxme + - smg247 From dea153946a456d28612b846571f04c00e0cdde2c Mon Sep 17 00:00:00 2001 From: Stephen Benjamin Date: Tue, 5 May 2026 14:46:44 -0400 Subject: [PATCH 2/7] chore: prune stale plugins, commands, and skills Remove 10 full plugins: - container-image, doc, etcd, golang, gwapi, node, node-tuning, origin, session, yaml Remove 12 individual commands: - git: cherry-pick-by-patch, fix-cherrypick-robot-pr, summary - olm: approve, catalog, install, opm, uninstall, upgrade - openshift: destroy-cluster - utils: auto-approve-konflux-prs, placeholder 
Co-Authored-By: Claude Opus 4.6 --- .claude-plugin/marketplace.json | 60 - PLUGINS.md | 118 -- docs/data.json | 280 ---- .../.claude-plugin/plugin.json | 8 - plugins/container-image/README.md | 347 ----- plugins/container-image/commands/compare.md | 289 ---- plugins/container-image/commands/inspect.md | 315 ---- plugins/container-image/commands/tags.md | 310 ---- plugins/doc/.claude-plugin/plugin.json | 8 - plugins/doc/README.md | 18 - plugins/doc/commands/note.md | 71 - plugins/etcd/.claude-plugin/plugin.json | 8 - plugins/etcd/README.md | 170 --- plugins/etcd/commands/analyze-performance.md | 602 -------- plugins/etcd/commands/health-check.md | 460 ------ plugins/git/commands/cherry-pick-by-patch.md | 50 - .../git/commands/fix-cherrypick-robot-pr.md | 265 ---- plugins/git/commands/summary.md | 102 -- plugins/golang/.claude-plugin/plugin.json | 8 - plugins/golang/README.md | 79 - plugins/golang/commands/lint-fix.md | 106 -- plugins/golang/skills/lint/SKILL.md | 93 -- plugins/gwapi/.claude-plugin/plugin.json | 8 - plugins/gwapi/README.md | 87 -- plugins/gwapi/commands/check.md | 147 -- plugins/gwapi/commands/delete.md | 148 -- plugins/gwapi/commands/install.md | 208 --- plugins/gwapi/resources/gateway.yaml | 15 - plugins/gwapi/resources/gatewayclass.yaml | 6 - .../node-tuning/.claude-plugin/plugin.json | 8 - plugins/node-tuning/README.md | 31 - .../commands/analyze-node-tuning.md | 116 -- .../commands/generate-tuned-profile.md | 200 --- plugins/node-tuning/skills/scripts/SKILL.md | 183 --- .../skills/scripts/analyze_node_tuning.py | 1292 ----------------- .../skills/scripts/generate_tuned_profile.py | 414 ------ plugins/node/.claude-plugin/plugin.json | 8 - plugins/node/README.md | 158 -- .../commands/cluster-node-health-check.md | 760 ---------- plugins/olm/commands/approve.md | 305 ---- plugins/olm/commands/catalog.md | 433 ------ plugins/olm/commands/install.md | 272 ---- plugins/olm/commands/opm.md | 359 ----- plugins/olm/commands/uninstall.md | 392 ----- 
plugins/olm/commands/upgrade.md | 349 ----- plugins/openshift/commands/destroy-cluster.md | 360 ----- plugins/origin/.claude-plugin/plugin.json | 8 - plugins/origin/README.md | 124 -- .../commands/two-node-origin-pr-helper.md | 174 --- plugins/session/.claude-plugin/plugin.json | 8 - plugins/session/README.md | 20 - plugins/session/commands/save-session.md | 138 -- .../commands/auto-approve-konflux-prs.md | 143 -- plugins/utils/commands/placeholder.md | 26 - plugins/yaml/.claude-plugin/plugin.json | 9 - plugins/yaml/README.md | 18 - plugins/yaml/commands/docs.md | 168 --- 57 files changed, 10862 deletions(-) delete mode 100644 plugins/container-image/.claude-plugin/plugin.json delete mode 100644 plugins/container-image/README.md delete mode 100644 plugins/container-image/commands/compare.md delete mode 100644 plugins/container-image/commands/inspect.md delete mode 100644 plugins/container-image/commands/tags.md delete mode 100644 plugins/doc/.claude-plugin/plugin.json delete mode 100644 plugins/doc/README.md delete mode 100644 plugins/doc/commands/note.md delete mode 100644 plugins/etcd/.claude-plugin/plugin.json delete mode 100644 plugins/etcd/README.md delete mode 100644 plugins/etcd/commands/analyze-performance.md delete mode 100644 plugins/etcd/commands/health-check.md delete mode 100644 plugins/git/commands/cherry-pick-by-patch.md delete mode 100644 plugins/git/commands/fix-cherrypick-robot-pr.md delete mode 100644 plugins/git/commands/summary.md delete mode 100644 plugins/golang/.claude-plugin/plugin.json delete mode 100644 plugins/golang/README.md delete mode 100644 plugins/golang/commands/lint-fix.md delete mode 100644 plugins/golang/skills/lint/SKILL.md delete mode 100644 plugins/gwapi/.claude-plugin/plugin.json delete mode 100644 plugins/gwapi/README.md delete mode 100644 plugins/gwapi/commands/check.md delete mode 100644 plugins/gwapi/commands/delete.md delete mode 100644 plugins/gwapi/commands/install.md delete mode 100644 
plugins/gwapi/resources/gateway.yaml delete mode 100644 plugins/gwapi/resources/gatewayclass.yaml delete mode 100644 plugins/node-tuning/.claude-plugin/plugin.json delete mode 100644 plugins/node-tuning/README.md delete mode 100644 plugins/node-tuning/commands/analyze-node-tuning.md delete mode 100644 plugins/node-tuning/commands/generate-tuned-profile.md delete mode 100644 plugins/node-tuning/skills/scripts/SKILL.md delete mode 100644 plugins/node-tuning/skills/scripts/analyze_node_tuning.py delete mode 100644 plugins/node-tuning/skills/scripts/generate_tuned_profile.py delete mode 100644 plugins/node/.claude-plugin/plugin.json delete mode 100644 plugins/node/README.md delete mode 100644 plugins/node/commands/cluster-node-health-check.md delete mode 100644 plugins/olm/commands/approve.md delete mode 100644 plugins/olm/commands/catalog.md delete mode 100644 plugins/olm/commands/install.md delete mode 100644 plugins/olm/commands/opm.md delete mode 100644 plugins/olm/commands/uninstall.md delete mode 100644 plugins/olm/commands/upgrade.md delete mode 100644 plugins/openshift/commands/destroy-cluster.md delete mode 100644 plugins/origin/.claude-plugin/plugin.json delete mode 100644 plugins/origin/README.md delete mode 100644 plugins/origin/commands/two-node-origin-pr-helper.md delete mode 100644 plugins/session/.claude-plugin/plugin.json delete mode 100644 plugins/session/README.md delete mode 100644 plugins/session/commands/save-session.md delete mode 100644 plugins/utils/commands/auto-approve-konflux-prs.md delete mode 100644 plugins/utils/commands/placeholder.md delete mode 100644 plugins/yaml/.claude-plugin/plugin.json delete mode 100644 plugins/yaml/README.md delete mode 100644 plugins/yaml/commands/docs.md diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 953664493..5e4f1d68a 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -34,18 +34,6 @@ "description": "Team structure knowledge and health 
analysis commands for OpenShift teams", "version": "0.0.13" }, - { - "name": "doc", - "source": "./plugins/doc", - "description": "A plugin for engineering documentation and notes", - "version": "0.0.2" - }, - { - "name": "session", - "source": "./plugins/session", - "description": "A plugin for Claude session management and persistence", - "version": "0.0.2" - }, { "name": "snowflake", "source": "./plugins/snowflake", @@ -100,18 +88,6 @@ "description": "Implementation requirements and details for OpenShift TLS security profiles", "version": "0.0.1" }, - { - "name": "etcd", - "source": "./plugins/etcd", - "description": "Etcd cluster health monitoring and performance analysis utilities", - "version": "0.0.2" - }, - { - "name": "yaml", - "source": "./plugins/yaml", - "description": "YAML documentation and utilities", - "version": "0.0.2" - }, { "name": "must-gather", "source": "./plugins/must-gather", @@ -154,36 +130,12 @@ "description": "Analyze test coverage and identify gaps in test scenarios", "version": "0.0.2" }, - { - "name": "node-tuning", - "source": "./plugins/node-tuning", - "description": "Generate and analyze OpenShift node tuning profiles", - "version": "1.0.0" - }, { "name": "testing", "source": "./plugins/testing", "description": "Comprehensive testing utilities for operators and applications", "version": "0.1.0" }, - { - "name": "origin", - "source": "./plugins/origin", - "description": "Helpers for openshift/origin development.", - "version": "0.0.2" - }, - { - "name": "container-image", - "source": "./plugins/container-image", - "description": "Container image inspection and analysis using skopeo and podman", - "version": "0.0.2" - }, - { - "name": "node", - "source": "./plugins/node", - "description": "Kubernetes and OpenShift node health monitoring and diagnostics", - "version": "0.0.2" - }, { "name": "bigquery", "source": "./plugins/bigquery", @@ -196,24 +148,12 @@ "description": "Manage isolated git worktree workspaces for multi-repo 
development", "version": "1.0.1" }, - { - "name": "gwapi", - "source": "./plugins/gwapi", - "description": "Gateway API installation utilities for Kubernetes/OpenShift clusters", - "version": "0.0.1" - }, { "name": "code-review", "source": "./plugins/code-review", "description": "Automated code quality review with language-aware analysis for pre-commit verification", "version": "0.0.7" }, - { - "name": "golang", - "source": "./plugins/golang", - "description": "Run golang codebase related commands and tools", - "version": "0.1.0" - }, { "name": "ote-migration", "source": "./plugins/ote-migration", diff --git a/PLUGINS.md b/PLUGINS.md index fe4379aa5..b5464b582 100644 --- a/PLUGINS.md +++ b/PLUGINS.md @@ -8,26 +8,17 @@ This document lists all available Claude Code plugins and their commands in the - [Ci](#ci-plugin) - [Code Review](#code-review-plugin) - [Compliance](#compliance-plugin) -- [Container Image](#container-image-plugin) -- [Doc](#doc-plugin) -- [Etcd](#etcd-plugin) - [Git](#git-plugin) -- [Golang](#golang-plugin) -- [Gwapi](#gwapi-plugin) - [Hcp](#hcp-plugin) - [Hello World](#hello-world-plugin) - [Jira](#jira-plugin) - [Lvms](#lvms-plugin) - [Must Gather](#must-gather-plugin) -- [Node](#node-plugin) -- [Node Tuning](#node-tuning-plugin) - [Olm](#olm-plugin) - [Olm Team](#olm-team-plugin) - [Openshift](#openshift-plugin) - [Openshift Tls Profile](#openshift-tls-profile-plugin) -- [Origin](#origin-plugin) - [Ote Migration](#ote-migration-plugin) -- [Session](#session-plugin) - [Snowflake](#snowflake-plugin) - [Sosreport](#sosreport-plugin) - [Teams](#teams-plugin) @@ -35,7 +26,6 @@ This document lists all available Claude Code plugins and their commands in the - [Testing](#testing-plugin) - [Utils](#utils-plugin) - [Workspaces](#workspaces-plugin) -- [Yaml](#yaml-plugin) ### Agendas Plugin @@ -114,36 +104,6 @@ Security compliance and vulnerability analysis tools for Go projects See [plugins/compliance/README.md](plugins/compliance/README.md) for detailed 
documentation. -### Container Image Plugin - -Container image inspection and analysis using skopeo and podman - -**Commands:** -- **`/container-image:compare` ` `** - Compare two container images to identify differences -- **`/container-image:inspect` ``** - Inspect and provide detailed breakdown of a container image -- **`/container-image:tags` ``** - List and analyze available tags for a container image repository - -See [plugins/container-image/README.md](plugins/container-image/README.md) for detailed documentation. - -### Doc Plugin - -A plugin for engineering documentation and notes - -**Commands:** -- **`/doc:note` `[task description]`** - Generate professional engineering notes and append them to a log file - -See [plugins/doc/README.md](plugins/doc/README.md) for detailed documentation. - -### Etcd Plugin - -Etcd cluster health monitoring and performance analysis utilities - -**Commands:** -- **`/etcd:analyze-performance` `[--duration ]`** - Analyze etcd performance metrics, latency, and identify bottlenecks -- **`/etcd:health-check` `[--verbose]`** - Check etcd cluster health, member status, and identify issues - -See [plugins/etcd/README.md](plugins/etcd/README.md) for detailed documentation. - ### Git Plugin Git workflow automation and utilities @@ -152,36 +112,13 @@ Git workflow automation and utilities - **`/git:backport` ` [branch2...] 
[--new-branch]`** - Backport commits to multiple branches - **`/git:bisect` `[good-commit] [bad-commit]`** - Interactive git bisect assistant with pattern detection and automation - **`/git:branch-cleanup` `[--dry-run] [--merged-only] [--remote]`** - Clean up old and defunct branches that are no longer needed -- **`/git:cherry-pick-by-patch` ``** - Cherry-pick git commit into current branch by "patch" command - **`/git:commit-suggest` `[N]`** - Generate Conventional Commits style commit messages or summarize existing commits - **`/git:debt-scan`** - Analyze technical debt indicators in the repository -- **`/git:fix-cherrypick-robot-pr` ` [error-messages]`** - Fix a cherrypick-robot PR that needs manual intervention - **`/git:redescribe` `[pr-url]`** - Adapt and correct a PR description to match its code diffs and commit messages - **`/git:suggest-reviewers` `[base-branch]`** - Suggest appropriate reviewers for a PR based on git blame and OWNERS files -- **`/git:summary`** - Show current branch, git status, and recent commits for quick context See [plugins/git/README.md](plugins/git/README.md) for detailed documentation. -### Golang Plugin - -Run golang codebase related commands and tools - -**Commands:** -- **`/golang:lint-fix`** - Run golangci-lint tool and fix all reported issues - -See [plugins/golang/README.md](plugins/golang/README.md) for detailed documentation. - -### Gwapi Plugin - -Gateway API management for Kubernetes/OpenShift clusters - -**Commands:** -- **`/gwapi:check` `[namespace]`** - Check Gateway API resources status in the cluster -- **`/gwapi:delete` `[namespace]`** - Delete Gateway API resources from a Kubernetes/OpenShift cluster -- **`/gwapi:install` `[namespace]`** - Install Gateway API resources to a Kubernetes/OpenShift cluster - -See [plugins/gwapi/README.md](plugins/gwapi/README.md) for detailed documentation. 
- ### Hcp Plugin Generate HyperShift cluster creation commands via hcp CLI from natural language descriptions @@ -247,41 +184,16 @@ A plugin to analyze and report on must-gather data See [plugins/must-gather/README.md](plugins/must-gather/README.md) for detailed documentation. -### Node Plugin - -Kubernetes and OpenShift node health monitoring and diagnostics - -**Commands:** -- **`/node:cluster-node-health-check` `[--node ] [--verbose] [--output-format json|text]`** - Perform comprehensive health check on cluster nodes and report kubelet, CRI-O, and node-level issues - -See [plugins/node/README.md](plugins/node/README.md) for detailed documentation. - -### Node Tuning Plugin - -Automatically create and apply tuned profile - -**Commands:** -- **`/node-tuning:analyze-node-tuning` `[--sosreport PATH] [--format json|markdown] [--max-irq-samples N]`** - Analyze kernel/sysctl tuning from a live node or sosreport snapshot and propose NTO recommendations -- **`/node-tuning:generate-tuned-profile` `[profile-name] [--summary ...] [--sysctl ...] [options]`** - Generate a Tuned (tuned.openshift.io/v1) profile manifest for the Node Tuning Operator - -See [plugins/node-tuning/README.md](plugins/node-tuning/README.md) for detailed documentation. 
- ### Olm Plugin OLM (Operator Lifecycle Manager) plugin for operator management and debugging **Commands:** -- **`/olm:approve` ` [namespace] [--all]`** - Approve pending InstallPlans for operator installations and upgrades -- **`/olm:catalog` ` [arguments]`** - Manage catalog sources for discovering and installing operators - **`/olm:debug` ` [olm-version]`** - Debug OLM issues using must-gather logs and source code analysis - **`/olm:diagnose` `[operator-name] [namespace] [--fix] [--cluster]`** - Diagnose and optionally fix common OLM and operator issues -- **`/olm:install` ` [namespace] [channel] [source] [--approval=Automatic|Manual]`** - Install a day-2 operator using Operator Lifecycle Manager - **`/olm:list` `[namespace] [--all-namespaces]`** - List installed operators in the cluster -- **`/olm:opm` ` [arguments...]`** - Execute opm (Operator Package Manager) commands for building and managing operator catalogs - **`/olm:search` `[query] [--catalog ]`** - Search for available operators in catalog sources - **`/olm:status` ` [namespace]`** - Get detailed status and health information for an operator -- **`/olm:uninstall` ` [namespace] [--remove-crds] [--remove-namespace]`** - Uninstall a day-2 operator and optionally remove its resources -- **`/olm:upgrade` ` [namespace] [--channel=] [--approve]`** - Update an operator to the latest version or switch channels See [plugins/olm/README.md](plugins/olm/README.md) for detailed documentation. 
@@ -307,7 +219,6 @@ OpenShift development utilities and helpers - **`/openshift:cluster-health-check` `[--verbose] [--output-format]`** - Perform comprehensive health check on OpenShift cluster and report issues - **`/openshift:crd-review` `[repository-path]`** - Review Kubernetes CRDs against Kubernetes and OpenShift API conventions - **`/openshift:create-cluster` `[release-image] [platform] [options]`** - Extract OpenShift installer from release image and create an OCP cluster -- **`/openshift:destroy-cluster` `[install-dir]`** - Destroy an OpenShift cluster created by create-cluster command - **`/openshift:expand-test-case` `[test-idea-or-file-or-commands] [format]`** - Expand basic test ideas or existing oc commands into comprehensive test scenarios with edge cases in oc CLI or Ginkgo format - **`/openshift:ironic-status`** - Check status of Ironic baremetal nodes in OpenShift cluster - **`/openshift:new-e2e-test` `[test-specification]`** - Write and validate new OpenShift E2E tests using Ginkgo framework @@ -330,15 +241,6 @@ Implementation requirements and details for OpenShift TLS security profiles See [plugins/openshift-tls-profile/README.md](plugins/openshift-tls-profile/README.md) for detailed documentation. -### Origin Plugin - -Helpers for openshift/origin development. - -**Commands:** -- **`/origin:two-node-origin-pr-helper` `[--url PR_URL] [] [--depth quick|full]`** - Expert review tool for PRs that add or modify Two Node (Fencing or Arbiter) tests under test/extended/two_node/ in openshift/origin. - -See [plugins/origin/README.md](plugins/origin/README.md) for detailed documentation. - ### Ote Migration Plugin Automate OpenShift Tests Extension (OTE) migration for component repositories @@ -348,15 +250,6 @@ Automate OpenShift Tests Extension (OTE) migration for component repositories See [plugins/ote-migration/README.md](plugins/ote-migration/README.md) for detailed documentation. 
-### Session Plugin - -A plugin to save and resume conversation sessions across long time intervals - -**Commands:** -- **`/session:save-session` `[optional-description]`** - Save current conversation session to markdown file for future continuation - -See [plugins/session/README.md](plugins/session/README.md) for detailed documentation. - ### Snowflake Plugin Snowflake data analysis commands for engineering metrics and reports @@ -419,11 +312,9 @@ A generic utilities plugin serving as a catch-all for various helper commands an **Commands:** - **`/utils:address-reviews` `[PR number (optional - uses current branch if omitted)] [--preview]`** - Fetch and address all PR review comments -- **`/utils:auto-approve-konflux-prs` ``** - Automate approving Konflux bot PRs for the given repository by adding /lgtm and /approve - **`/utils:find-konflux-images` ``** - Find and verify Konflux-built container images from a GitHub PR - **`/utils:generate-test-plan` `[GitHub PR URLs]`** - Generate test steps for one or more related PRs - **`/utils:gh-attention` `[--repo ]`** - List PRs and issues requiring your attention -- **`/utils:placeholder`** - Placeholder command for the utils plugin - **`/utils:process-renovate-pr` ` [JIRA_PROJECT] [COMPONENT]`** - Process Renovate dependency PR(s) to meet repository contribution standards - **`/utils:review-ai-helpers-overlap` `[--idea TEXT] [--pr NUMBER] [--verbose]`** - Review potential overlaps with existing ai-helpers (Claude Code Plugins, Commands, Skills, Sub-agents, or Hooks) and open PRs - **`/utils:review-security` `[file-paths-or-patterns]`** - Orchestrate security scanners and provide contextual triage of findings @@ -439,12 +330,3 @@ Manage isolated git worktree workspaces for multi-repo development - **`/workspaces:delete` ``** - Delete a workspace and its git worktrees See [plugins/workspaces/README.md](plugins/workspaces/README.md) for detailed documentation. 
- -### Yaml Plugin - -Generate comprehensive YAML documentation from Go struct definitions with sensible default values - -**Commands:** -- **`/yaml:docs` `[file:StructName] [output.md]`** - Generate comprehensive YAML documentation from Go struct definitions with sensible default values - -See [plugins/yaml/README.md](plugins/yaml/README.md) for detailed documentation. diff --git a/docs/data.json b/docs/data.json index fcde4773e..217901c87 100644 --- a/docs/data.json +++ b/docs/data.json @@ -22,12 +22,6 @@ "name": "branch-cleanup", "synopsis": "/git:branch-cleanup [--dry-run] [--merged-only] [--remote]" }, - { - "argument_hint": "", - "description": "Cherry-pick git commit into current branch by \"patch\" command", - "name": "cherry-pick-by-patch", - "synopsis": "/git:cherry-pick-by-patch commit_hash" - }, { "argument_hint": "[N]", "description": "Generate Conventional Commits style commit messages or summarize existing commits", @@ -40,12 +34,6 @@ "name": "debt-scan", "synopsis": "/git:debt-scan" }, - { - "argument_hint": " [error-messages]", - "description": "Fix a cherrypick-robot PR that needs manual intervention", - "name": "fix-cherrypick-robot-pr", - "synopsis": "/git:fix-cherrypick-robot-pr [error-messages]" - }, { "argument_hint": "[pr-url]", "description": "Adapt and correct a PR description to match its code diffs and commit messages", @@ -57,12 +45,6 @@ "description": "Suggest appropriate reviewers for a PR based on git blame and OWNERS files", "name": "suggest-reviewers", "synopsis": "/git:suggest-reviewers [base-branch]" - }, - { - "argument_hint": "", - "description": "Show current branch, git status, and recent commits for quick context", - "name": "summary", - "synopsis": "/git:summary" } ], "description": "Git Plugin", @@ -745,38 +727,6 @@ ], "version": "0.0.13" }, - { - "commands": [ - { - "argument_hint": "[task description]", - "description": "Generate professional engineering notes and append them to a log file", - "name": "note", - 
"synopsis": "/doc:note [task description]" - } - ], - "description": "A plugin for engineering documentation and notes", - "has_readme": true, - "hooks": [], - "name": "doc", - "skills": [], - "version": "0.0.2" - }, - { - "commands": [ - { - "argument_hint": "[optional-description]", - "description": "Save current conversation session to markdown file for future continuation", - "name": "save-session", - "synopsis": "/save-session" - } - ], - "description": "A plugin for Claude session management and persistence", - "has_readme": true, - "hooks": [], - "name": "session", - "skills": [], - "version": "0.0.2" - }, { "commands": [ { @@ -855,12 +805,6 @@ "name": "address-reviews", "synopsis": "" }, - { - "argument_hint": "", - "description": "Automate approving Konflux bot PRs for the given repository by adding /lgtm and /approve", - "name": "auto-approve-konflux-prs", - "synopsis": "" - }, { "argument_hint": "", "description": "Find and verify Konflux-built container images from a GitHub PR", @@ -879,12 +823,6 @@ "name": "gh-attention", "synopsis": "/utils:gh-attention [--repo ]" }, - { - "argument_hint": "", - "description": "Placeholder command for the utils plugin", - "name": "placeholder", - "synopsis": "/utils:placeholder" - }, { "argument_hint": " [JIRA_PROJECT] [COMPONENT]", "description": "Process Renovate dependency PR(s) to meet repository contribution standards", @@ -913,18 +851,6 @@ }, { "commands": [ - { - "argument_hint": " [namespace] [--all]", - "description": "Approve pending InstallPlans for operator installations and upgrades", - "name": "approve", - "synopsis": "/olm:approve [namespace] [--all]" - }, - { - "argument_hint": " [arguments]", - "description": "Manage catalog sources for discovering and installing operators", - "name": "catalog", - "synopsis": "/olm:catalog list" - }, { "argument_hint": " [olm-version]", "description": "Debug OLM issues using must-gather logs and source code analysis", @@ -937,24 +863,12 @@ "name": "diagnose", 
"synopsis": "/olm:diagnose [operator-name] [namespace] [--fix] [--cluster]" }, - { - "argument_hint": " [namespace] [channel] [source] [--approval=Automatic|Manual]", - "description": "Install a day-2 operator using Operator Lifecycle Manager", - "name": "install", - "synopsis": "/olm:install [namespace] [channel] [source] [--approval=Automatic|Manual]" - }, { "argument_hint": "[namespace] [--all-namespaces]", "description": "List installed operators in the cluster", "name": "list", "synopsis": "/olm:list [namespace] [--all-namespaces]" }, - { - "argument_hint": " [arguments...]", - "description": "Execute opm (Operator Package Manager) commands for building and managing operator catalogs", - "name": "opm", - "synopsis": "/olm:opm build-index-image [--cacheless] [--arch=] [--base-image=] [--builder-image=]" - }, { "argument_hint": "[query] [--catalog ]", "description": "Search for available operators in catalog sources", @@ -966,18 +880,6 @@ "description": "Get detailed status and health information for an operator", "name": "status", "synopsis": "/olm:status [namespace]" - }, - { - "argument_hint": " [namespace] [--remove-crds] [--remove-namespace]", - "description": "Uninstall a day-2 operator and optionally remove its resources", - "name": "uninstall", - "synopsis": "/olm:uninstall [namespace] [--remove-crds] [--remove-namespace]" - }, - { - "argument_hint": " [namespace] [--channel=] [--approve]", - "description": "Update an operator to the latest version or switch channels", - "name": "upgrade", - "synopsis": "/olm:upgrade [namespace] [--channel=] [--approve]" } ], "description": "OLM (Operator Lifecycle Manager) plugin for operator management and debugging", @@ -1097,12 +999,6 @@ "name": "create-cluster", "synopsis": "/openshift:create-cluster [release-image] [platform] [options]" }, - { - "argument_hint": "[install-dir]", - "description": "Destroy an OpenShift cluster created by create-cluster command", - "name": "destroy-cluster", - "synopsis": 
"/openshift:destroy-cluster [install-dir]" - }, { "argument_hint": "[test-idea-or-file-or-commands] [format]", "description": "Expand basic test ideas or existing oc commands into comprehensive test scenarios with edge cases in oc CLI or Ginkgo format", @@ -1204,44 +1100,6 @@ ], "version": "0.0.1" }, - { - "commands": [ - { - "argument_hint": "[--duration ]", - "description": "Analyze etcd performance metrics, latency, and identify bottlenecks", - "name": "analyze-performance", - "synopsis": "/etcd:analyze-performance [--duration ]" - }, - { - "argument_hint": "[--verbose]", - "description": "Check etcd cluster health, member status, and identify issues", - "name": "health-check", - "synopsis": "/etcd:health-check [--verbose]" - } - ], - "description": "Etcd cluster health monitoring and performance analysis utilities", - "has_readme": true, - "hooks": [], - "name": "etcd", - "skills": [], - "version": "0.0.2" - }, - { - "commands": [ - { - "argument_hint": "[file:StructName] [output.md]", - "description": "Generate comprehensive YAML documentation from Go struct definitions with sensible default values", - "name": "docs", - "synopsis": "/yaml:docs [file:StructName] [output.md]" - } - ], - "description": "YAML documentation and utilities", - "has_readme": true, - "hooks": [], - "name": "yaml", - "skills": [], - "version": "0.0.2" - }, { "commands": [ { @@ -1466,34 +1324,6 @@ ], "version": "0.0.2" }, - { - "commands": [ - { - "argument_hint": "[--sosreport PATH] [--format json|markdown] [--max-irq-samples N]", - "description": "Analyze kernel/sysctl tuning from a live node or sosreport snapshot and propose NTO recommendations", - "name": "analyze-node-tuning", - "synopsis": "/node-tuning:analyze-node-tuning [--sosreport PATH] [--collect-sosreport|--no-collect-sosreport] [--sosreport-output PATH] [--node NODE] [--kubeconfig PATH] [--oc-binary PATH] [--format json|markdown] [--max-irq-samples N] [--keep-snapshot]" - }, - { - "argument_hint": "[profile-name] [--summary 
...] [--sysctl ...] [options]", - "description": "Generate a Tuned (tuned.openshift.io/v1) profile manifest for the Node Tuning Operator", - "name": "generate-tuned-profile", - "synopsis": "/node-tuning:generate-tuned-profile [profile-name] [--summary TEXT] [--include VALUE ...] [--sysctl KEY=VALUE ...] [--match-label KEY[=VALUE] ...] [options]" - } - ], - "description": "Generate and analyze OpenShift node tuning profiles", - "has_readme": true, - "hooks": [], - "name": "node-tuning", - "skills": [ - { - "description": "Generate tuned manifests and evaluate node tuning snapshots", - "id": "scripts", - "name": "Node Tuning Helper Scripts" - } - ], - "version": "1.0.0" - }, { "commands": [ { @@ -1521,66 +1351,6 @@ ], "version": "0.1.0" }, - { - "commands": [ - { - "argument_hint": "[--url PR_URL] [] [--depth quick|full]", - "description": "Expert review tool for PRs that add or modify Two Node (Fencing or Arbiter) tests under test/extended/two_node/ in openshift/origin.", - "name": "two-node-origin-pr-helper", - "synopsis": "/origin:two-node-origin-pr-helper [--url PR_URL] [] [--depth quick|full]" - } - ], - "description": "Helpers for openshift/origin development.", - "has_readme": true, - "hooks": [], - "name": "origin", - "skills": [], - "version": "0.0.2" - }, - { - "commands": [ - { - "argument_hint": " ", - "description": "Compare two container images to identify differences", - "name": "compare", - "synopsis": "/container-image:compare " - }, - { - "argument_hint": "", - "description": "Inspect and provide detailed breakdown of a container image", - "name": "inspect", - "synopsis": "/container-image:inspect " - }, - { - "argument_hint": "", - "description": "List and analyze available tags for a container image repository", - "name": "tags", - "synopsis": "/container-image:tags " - } - ], - "description": "Container image inspection and analysis using skopeo and podman", - "has_readme": true, - "hooks": [], - "name": "container-image", - "skills": [], - 
"version": "0.0.2" - }, - { - "commands": [ - { - "argument_hint": "[--node ] [--verbose] [--output-format json|text]", - "description": "Perform comprehensive health check on cluster nodes and report kubelet, CRI-O, and node-level issues", - "name": "cluster-node-health-check", - "synopsis": "/node:cluster-node-health-check [--node ] [--verbose] [--output-format json|text]" - } - ], - "description": "Kubernetes and OpenShift node health monitoring and diagnostics", - "has_readme": true, - "hooks": [], - "name": "node", - "skills": [], - "version": "0.0.2" - }, { "commands": [ { @@ -1625,34 +1395,6 @@ "skills": [], "version": "1.0.1" }, - { - "commands": [ - { - "argument_hint": "[namespace]", - "description": "Check Gateway API resources status in the cluster", - "name": "check", - "synopsis": "/gwapi:check [namespace]" - }, - { - "argument_hint": "[namespace]", - "description": "Delete Gateway API resources from a Kubernetes/OpenShift cluster", - "name": "delete", - "synopsis": "/gwapi:delete [namespace]" - }, - { - "argument_hint": "[namespace]", - "description": "Install Gateway API resources to a Kubernetes/OpenShift cluster", - "name": "install", - "synopsis": "/gwapi:install [namespace]" - } - ], - "description": "Gateway API installation utilities for Kubernetes/OpenShift clusters", - "has_readme": true, - "hooks": [], - "name": "gwapi", - "skills": [], - "version": "0.0.1" - }, { "commands": [ { @@ -1691,28 +1433,6 @@ ], "version": "0.0.7" }, - { - "commands": [ - { - "argument_hint": "", - "description": "Run golangci-lint tool and fix all reported issues", - "name": "lint-fix", - "synopsis": "/golang:lint-fix" - } - ], - "description": "Run golang codebase related commands and tools", - "has_readme": true, - "hooks": [], - "name": "golang", - "skills": [ - { - "description": "Detect and run golangci-lint in a Go repository using the best available method", - "id": "lint", - "name": "Go Lint" - } - ], - "version": "0.1.0" - }, { "commands": [ { diff --git 
a/plugins/container-image/.claude-plugin/plugin.json b/plugins/container-image/.claude-plugin/plugin.json deleted file mode 100644 index bc1a49d7d..000000000 --- a/plugins/container-image/.claude-plugin/plugin.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "name": "container-image", - "description": "Container image inspection and analysis using skopeo and podman", - "version": "0.0.2", - "author": { - "name": "github.com/openshift-eng" - } -} diff --git a/plugins/container-image/README.md b/plugins/container-image/README.md deleted file mode 100644 index c311b3177..000000000 --- a/plugins/container-image/README.md +++ /dev/null @@ -1,347 +0,0 @@ -# Container Image Plugin - -Container image inspection and analysis tools using skopeo and podman. - -## Overview - -This plugin provides commands to inspect, analyze, and compare container images from any OCI-compliant registry. It leverages `skopeo` and `podman` to provide detailed insights into image structure, manifest lists, layers, and configuration without requiring full image pulls. - -## Features - -- **Image Inspection**: Detailed breakdown of image metadata, layers, and configuration -- **Image Comparison**: Compare two images to identify differences -- **Tag Discovery**: List and analyze available tags for a repository - -## Commands - -### `/container-image:inspect` - -Inspect and provide detailed breakdown of a container image. - -**Usage:** -```bash -/container-image:inspect -``` - -**Examples:** -```bash -/container-image:inspect quay.io/openshift-release-dev/ocp-release:4.20.0-multi -/container-image:inspect registry.redhat.io/ubi9/ubi:latest -/container-image:inspect docker.io/library/nginx@sha256:abc123... 
-``` - -**What it shows:** -- Inferred image purpose and context based on metadata analysis -- Image digest and basic metadata -- Architecture and OS information -- Manifest type (single image vs manifest list) -- For multi-arch images: all available platforms with per-platform digests, sizes, and layer counts -- Platform comparison (size ranges, architecture list) -- Size breakdown and layer details -- Configuration (environment, entrypoint, ports, volumes) -- Labels and annotations -- Usage examples for pulling specific platforms - -See [commands/inspect.md](commands/inspect.md) for full documentation. - -### `/container-image:compare` - -Compare two container images to identify differences. - -**Usage:** -```bash -/container-image:compare -``` - -**Examples:** -```bash -/container-image:compare quay.io/myapp:v1.0.0 quay.io/myapp:v2.0.0 -/container-image:compare registry.prod.example.com/myapp:latest registry.staging.example.com/myapp:latest -``` - -**What it shows:** -- Whether images are identical (digest match) -- Metadata differences (creation date, size) -- Layer analysis (added, removed, modified layers) -- Configuration changes (environment variables, labels, entrypoint) -- Size impact -- Summary of significant changes - -See [commands/compare.md](commands/compare.md) for full documentation. - -### `/container-image:tags` - -List and analyze available tags for a container image repository. - -**Usage:** -```bash -/container-image:tags -``` - -**Examples:** -```bash -/container-image:tags quay.io/openshift-release-dev/ocp-release -/container-image:tags docker.io/library/nginx -``` - -**What it shows:** -- All available tags for the repository -- Tag metadata (creation date, size, architecture) -- Tag categorization (version, date-based, special tags) -- Recent tags and update patterns -- Recommendations for tag selection -- Duplicate tags (same digest, different names) - -See [commands/tags.md](commands/tags.md) for full documentation. 
- -## Installation - -### From the Claude Code Plugin Marketplace - -1. **Add the marketplace** (if not already added): - ```bash - /plugin marketplace add openshift-eng/ai-helpers - ``` - -2. **Install the container-image plugin**: - ```bash - /plugin install container-image@ai-helpers - ``` - -3. **Use the commands**: - ```bash - /container-image:inspect quay.io/openshift-release-dev/ocp-release:4.20.0-multi - ``` - -## Prerequisites - -### Required Tools - -**skopeo** - Primary tool for image inspection - -- Check if installed: `which skopeo` -- Installation: - - RHEL/Fedora: `sudo dnf install skopeo` - - Ubuntu/Debian: `sudo apt-get install skopeo` - - macOS: `brew install skopeo` -- Documentation: https://github.com/containers/skopeo - -### Optional Tools - -**podman** - Additional image analysis capabilities - -- Installation: - - RHEL/Fedora: `sudo dnf install podman` - - Ubuntu/Debian: `sudo apt-get install podman` - - macOS: `brew install podman` -- Documentation: https://podman.io/ - -**dive** - Interactive layer analysis (for `/container-image:compare`) - -- Installation: https://github.com/wagoodman/dive -- Provides detailed layer-by-layer exploration - -### Registry Authentication - -For private registries, authenticate before running commands: - -```bash -# Using skopeo -skopeo login registry.example.com - -# Using podman (if installed) -podman login registry.example.com -``` - -Authentication is typically stored at `~/.docker/config.json` or `${XDG_RUNTIME_DIR}/containers/auth.json`. - -## Use Cases - -### Development Workflows - -1. **Version Selection**: Find the right image version for your deployment - ```bash - /container-image:tags quay.io/myapp - /container-image:inspect quay.io/myapp:v2.1.0 - ``` - -2. 
**Multi-Arch Development**: Verify architecture support before deployment - ```bash - /container-image:inspect registry.redhat.io/ubi9/ubi:latest - ``` - The inspect command automatically detects and shows all available platforms for multi-arch images. - -3. **Update Analysis**: Understand changes before upgrading - ```bash - /container-image:compare myapp:current myapp:latest - ``` - -### Troubleshooting - -1. **Deployment Issues**: Verify correct image is being used - ```bash - /container-image:inspect - ``` - -2. **Architecture Mismatches**: Check platform compatibility - ```bash - /container-image:inspect - ``` - For multi-arch images, this will show all available platforms and their digests. - -3. **Size Issues**: Identify what's consuming space - ```bash - /container-image:inspect - /container-image:compare - ``` - -### Security & Compliance - -1. **Image Verification**: Confirm image authenticity via digest - ```bash - /container-image:inspect myapp@sha256:abc123... - ``` - -2. **Change Tracking**: Document what changed between versions - ```bash - /container-image:compare prod:v1.0.0 prod:v1.1.0 - ``` - -3. **Registry Migration**: Verify images copied correctly - ```bash - /container-image:compare source.registry.com/app:v1 dest.registry.com/app:v1 - ``` - -## Common Workflows - -### Upgrading an Application Image - -```bash -# 1. List available versions -/container-image:tags quay.io/myapp - -# 2. Inspect the new version (shows all architectures if multi-arch) -/container-image:inspect quay.io/myapp:v2.0.0 - -# 3. Compare with current version -/container-image:compare quay.io/myapp:v1.5.0 quay.io/myapp:v2.0.0 -``` - -### Verifying Multi-Architecture Support - -```bash -# 1. Check if image is multi-arch and see all platforms -/container-image:inspect quay.io/myapp:latest - -# 2. Inspect specific platform by digest -/container-image:inspect quay.io/myapp@sha256: - -# 3. 
Compare platforms -/container-image:compare quay.io/myapp@sha256: quay.io/myapp@sha256: -``` - -### Investigating Image Bloat - -```bash -# 1. Inspect current image -/container-image:inspect myapp:latest - -# 2. Compare with previous version -/container-image:compare myapp:v1.0.0 myapp:latest - -# 3. Identify which layers added size -# (Layer analysis in the comparison output) -``` - -## Tips & Best Practices - -### Image References - -- **Use digests for production**: `myapp@sha256:abc123...` (immutable) -- **Use tags for development**: `myapp:latest` (convenient but mutable) -- **Be specific**: `myapp:v1.2.3` is better than `myapp:v1` - -### Multi-Architecture Images - -- Use `/container-image:inspect` to check platform support - it automatically detects and displays all available architectures -- Pull specific platforms when needed: `podman pull --platform=linux/arm64 ` -- Verify all platforms are updated in manifest lists by comparing platform digests - -### Performance - -- `skopeo inspect` doesn't pull the full image (fast and efficient) -- For large repositories, `/container-image:tags` may sample tags -- Use `--filter` options to narrow results for large tag lists - -### Security - -- Always verify image digests match expectations -- Check for unexpected configuration changes with `/container-image:compare` -- Use `/container-image:inspect` to review labels and metadata - -## Plugin Structure - -``` -plugins/container-image/ -├── .claude-plugin/ -│ └── plugin.json # Plugin metadata -├── commands/ -│ ├── inspect.md # Image inspection command -│ ├── compare.md # Image comparison command -│ └── tags.md # Tag listing command -└── README.md # This file -``` - -## Development - -### Adding New Commands - -To add a new command to this plugin: - -1. Create a new markdown file in `commands/`: - ```bash - touch plugins/container-image/commands/your-command.md - ``` - -2. Follow the structure from existing commands (see `commands/inspect.md`) - -3. 
Include these sections: - - Name - - Synopsis - - Description - - Prerequisites - - Implementation - - Return Value - - Examples - - Error Handling - - Notes - - Arguments - -4. Test your command: - ```bash - /container-image:your-command - ``` - -### Testing - -Test commands with various image types: -- Public images (docker.io, quay.io) -- Private registries (requires authentication) -- Multi-arch images (manifest lists) -- Single-arch images -- Large images (layer analysis) -- Different registries (Red Hat, Quay, Docker Hub) - -## Contributing - -Contributions are welcome! When adding new container image analysis commands: - -1. Ensure the command provides unique value not covered by existing commands -2. Follow the existing command structure and documentation format -3. Include comprehensive examples and error handling -4. Test with multiple registries and image types -5. Update this README with new command documentation - -## License - -See [LICENSE](../../LICENSE) for details. diff --git a/plugins/container-image/commands/compare.md b/plugins/container-image/commands/compare.md deleted file mode 100644 index 0cf29d862..000000000 --- a/plugins/container-image/commands/compare.md +++ /dev/null @@ -1,289 +0,0 @@ ---- -description: Compare two container images to identify differences -argument-hint: <image1> <image2> ---- - -## Name -container-image:compare - -## Synopsis -``` -/container-image:compare <image1> <image2> -``` - -## Description - -The `container-image:compare` command compares two container images and identifies their differences. This is useful for understanding what changed between image versions, comparing images from different registries, or verifying image rebuilds.
- -The command analyzes and compares: -- Image metadata (digests, creation dates) -- Layer differences (added, removed, modified) -- Size differences -- Configuration changes (environment variables, labels, entrypoints) -- Platform/architecture support -- Security and vulnerability differences (if scanning tools available) - -This command is useful for: -- Understanding changes between image versions -- Verifying image rebuilds match expectations -- Comparing images across registries (e.g., production vs staging) -- Identifying what layers changed in an update -- Troubleshooting deployment issues -- Security auditing and change tracking - -## Prerequisites - -**Required Tools:** - -1. **skopeo** - For image inspection and comparison - - Check if installed: `which skopeo` - - Installation: - - RHEL/Fedora: `sudo dnf install skopeo` - - Ubuntu/Debian: `sudo apt-get install skopeo` - - macOS: `brew install skopeo` - - Documentation: https://github.com/containers/skopeo - -**Optional Tools:** - -2. **podman** - For additional image analysis - - Useful for layer-by-layer comparison - - Installation: See `/container-image:inspect` prerequisites - -3. **dive** - For detailed layer analysis - - Check if installed: `which dive` - - Installation: https://github.com/wagoodman/dive - - Provides interactive layer comparison - -**Registry Authentication:** - -For private registries: -```bash -skopeo login registry.example.com -``` - -## Implementation - -The command performs the following comparison: - -1. **Check Tool Availability**: - - Verify `skopeo` is installed - - Check for optional tools (`podman`, `dive`) - -2. **Inspect Both Images**: - ```bash - skopeo inspect docker://<image1> - skopeo inspect docker://<image2> - ``` - -3. **Compare Basic Metadata**: - - Digests (are they the same image?) - - Creation timestamps - - Architecture and OS - - Manifest type (single vs manifest list) - -4.
**Analyze Layer Differences**: - - Extract layer digests from both images - - Identify: - - **Common layers**: Layers shared between images - - **Added layers**: New layers in image2 - - **Removed layers**: Layers from image1 not in image2 - - **Modified layers**: Layers with same position but different content - - Calculate size differences - -5. **Compare Configuration**: - - Environment variables (added, removed, changed) - - Labels and annotations - - Exposed ports - - Entrypoint and command - - Working directory - - User/UID - - Volume mount points - -6. **Calculate Size Impact**: - - Total size difference - - Size added by new layers - - Size saved by removed layers - -7. **Present Structured Comparison**: - - Summary of differences - - Detailed breakdown by category - - Highlight significant changes - - Provide recommendations - -## Return Value - -The command outputs a structured comparison report: - -``` -================================================================================ -CONTAINER IMAGE COMPARISON -================================================================================ -Image 1: quay.io/openshift-release-dev/ocp-release:4.16.0 -Image 2: quay.io/openshift-release-dev/ocp-release:4.17.0 - -COMPARISON SUMMARY: - Images are: DIFFERENT - Digest match: NO - Architecture: Both linux/amd64 - -METADATA COMPARISON: - Attribute Image 1 Image 2 Change - ──────────────────────────────────────────────────────────────────────────────────────── - Digest sha256:abc123... sha256:def456... 
CHANGED - Created 2023-11-15T10:30:45Z 2024-01-15T10:30:45Z +61 days - Size 1.15 GB 1.22 GB +70 MB - -LAYER ANALYSIS: - Total Layers (Image 1): 15 - Total Layers (Image 2): 17 - - Common Layers: 12 layers (850 MB) - Added Layers: 5 layers (220 MB) - Removed Layers: 3 layers (150 MB) - - Layer Breakdown: - ✓ Layer 1-8: IDENTICAL (base layers) - + Layer 9: ADDED in Image 2 (45 MB) - New component added - - Layer 10: REMOVED from Image 1 (30 MB) - Old dependency removed - ✓ Layer 11-15: IDENTICAL - + Layer 16-17: ADDED in Image 2 (25 MB) - Updates - -CONFIGURATION DIFFERENCES: - - Environment Variables: - + OPENSHIFT_VERSION=4.17.0 (was: 4.16.0) - + NEW_FEATURE_FLAG=enabled (added) - - DEPRECATED_FLAG=true (removed) - - Labels: - + io.openshift.release=4.17.0 (was: 4.16.0) - + io.openshift.build-date=2024-01-15 (was: 2023-11-15) - - Exposed Ports: - ✓ 8080/tcp (unchanged) - ✓ 8443/tcp (unchanged) - - Entrypoint: - ✓ ["/usr/bin/entrypoint.sh"] (unchanged) - - Command: - - ["--legacy-mode"] (removed) - + ["--v2-mode"] (added) - -SIGNIFICANT CHANGES: - • Version upgrade: 4.16.0 → 4.17.0 - • Size increase: +70 MB (+6%) - • 5 new layers added - • 3 old layers removed - • Command-line arguments changed - • New feature flag enabled - -RECOMMENDATIONS: - • Review changelog for 4.16.0 → 4.17.0 upgrade - • Test with new command-line arguments (--v2-mode) - • Verify NEW_FEATURE_FLAG behavior in your environment - • Consider size impact (+70 MB) in constrained environments -================================================================================ -``` - -**For Identical Images:** -``` -================================================================================ -CONTAINER IMAGE COMPARISON -================================================================================ -Image 1: quay.io/myapp:v1.0.0 -Image 2: registry.example.com/myapp:v1.0.0 - -COMPARISON SUMMARY: - Images are: IDENTICAL - Digest match: YES (sha256:abc123...) 
- -These images are the same, just referenced from different registries. -No differences found. -================================================================================ -``` - -## Examples - -1. **Compare two versions of the same image**: - ``` - /container-image:compare quay.io/openshift-release-dev/ocp-release:4.16.0 quay.io/openshift-release-dev/ocp-release:4.17.0 - ``` - Shows what changed between OpenShift 4.16 and 4.17. - -2. **Compare production vs staging**: - ``` - /container-image:compare registry.prod.example.com/myapp:latest registry.staging.example.com/myapp:latest - ``` - Verifies staging matches production. - -3. **Compare images across registries**: - ``` - /container-image:compare docker.io/library/nginx:1.25 quay.io/nginx/nginx:1.25 - ``` - Checks if images from different registries are identical. - -4. **Verify image rebuild**: - ``` - /container-image:compare myapp:v1.0.0-original myapp:v1.0.0-rebuilt - ``` - Confirms rebuild produced the same image. - -5. **Compare by digest**: - ``` - /container-image:compare quay.io/myapp@sha256:abc123... quay.io/myapp@sha256:def456... - ``` - Compares specific image versions by digest. 
- -## Error Handling - -- **Image not found**: Verify both image references are correct -- **Authentication required**: Ensure you're logged into both registries -- **Network errors**: Check connectivity to both registries -- **Tool not available**: Provide installation instructions for `skopeo` -- **Different architectures**: Note when comparing images for different platforms - -## Notes - -- **Digest Comparison**: If digests match, images are identical -- **Layer Sharing**: Base layers are often shared between versions -- **Size Calculation**: Sizes shown are compressed (as stored in registry) -- **Semantic Versioning**: Helps identify major vs minor changes -- **Build Reproducibility**: Identical source should produce identical digests -- **Registry Metadata**: Some metadata may differ even if image content is identical - -## Advanced Usage - -**Compare Specific Architectures:** - -For manifest lists, you can compare specific platform variants: -```bash -# Compare amd64 variants -/container-image:compare quay.io/myapp:v1@sha256: quay.io/myapp:v2@sha256: -``` - -**Layer-by-Layer Analysis:** - -If `dive` is installed, the command can provide interactive layer comparison: -```bash -dive --compare -``` - -## Use Cases - -1. **Version Upgrades**: Understand what changed before upgrading -2. **Security Auditing**: Track changes to identify security implications -3. **Deployment Verification**: Confirm correct image is deployed -4. **Registry Migration**: Verify images copied between registries -5. **Build Debugging**: Identify why builds differ -6. **Compliance**: Document and track image changes - -## Arguments - -- **$1** (image1): Required. First image reference. - - Format: `[registry/]repository[:tag|@digest]` - -- **$2** (image2): Required. Second image reference. - - Format: `[registry/]repository[:tag|@digest]` - -**Note**: Images can be from the same or different registries. 
diff --git a/plugins/container-image/commands/inspect.md b/plugins/container-image/commands/inspect.md deleted file mode 100644 index b8ef92c5d..000000000 --- a/plugins/container-image/commands/inspect.md +++ /dev/null @@ -1,315 +0,0 @@ ---- -description: Inspect and provide detailed breakdown of a container image -argument-hint: ---- - -## Name -container-image:inspect - -## Synopsis -``` -/container-image:inspect -``` - -## Description - -The `container-image:inspect` command provides a comprehensive breakdown of a container image using `skopeo` and `podman`. It analyzes the image metadata, configuration, and layers to give you detailed information about the image structure, size, architecture, and contents. - -This command is useful for: -- Understanding image composition and layers -- Verifying image architecture and OS -- Checking image size and disk usage -- Inspecting image labels and annotations -- Validating image configuration -- Troubleshooting image-related issues -- Verifying multi-architecture image support -- Checking which platforms are available for an image -- Comparing platform-specific image differences -- Planning multi-arch image builds - -The command works with images from any registry (quay.io, docker.io, registry.redhat.io, etc.) and automatically detects whether an image is a manifest list (multi-architecture) or a single image, providing detailed analysis for both cases. - -## Prerequisites - -**Required Tools:** - -1. **skopeo** - For image inspection without pulling - - Check if installed: `which skopeo` - - Installation: - - RHEL/Fedora: `sudo dnf install skopeo` - - Ubuntu/Debian: `sudo apt-get install skopeo` - - macOS: `brew install skopeo` - - Documentation: https://github.com/containers/skopeo - -2. 
**podman** (Optional) - For additional image analysis - - Check if installed: `which podman` - - Installation: - - RHEL/Fedora: `sudo dnf install podman` - - Ubuntu/Debian: `sudo apt-get install podman` - - macOS: `brew install podman` - - Documentation: https://podman.io/ - -**Registry Authentication:** - -For private registries, ensure you're authenticated: -```bash -# Using skopeo -skopeo login registry.example.com - -# Using podman -podman login registry.example.com -``` - -## Implementation - -The command performs the following analysis steps: - -1. **Check Tool Availability**: - - Verify `skopeo` is installed - - Check for `podman` (optional but recommended) - - If tools are missing, provide installation instructions - -2. **Inspect Image Metadata with skopeo**: - ```bash - skopeo inspect docker:// - ``` - - This provides: - - Image digest and tags - - Architecture and OS - - Layer information - - Creation timestamp - - Labels and annotations - - Environment variables - - Exposed ports - - Entrypoint and command - -3. **Determine Image Type**: - - Check if the image is a **manifest list** (multi-arch) or a **single image** - - Fetch raw manifest to determine type: - ```bash - skopeo inspect --raw docker:// - ``` - - Parse `schemaVersion` and `mediaType` to identify: - - **Manifest List** (OCI Index): `application/vnd.oci.image.index.v1+json` - - **Manifest List** (Docker): `application/vnd.docker.distribution.manifest.list.v2+json` - - **Single Image** (OCI): `application/vnd.oci.image.manifest.v1+json` - - **Single Image** (Docker): `application/vnd.docker.distribution.manifest.v2+json` - -4. **Extract Manifest List Details** (if applicable): - - For manifest lists, extract platform information for each variant: - - Architecture (amd64, arm64, ppc64le, s390x, etc.) 
- - OS (linux, windows) - - Variant (v7, v8 for ARM) - - Digest of platform-specific image - - Size of platform-specific image - - Optionally inspect each platform variant: - ```bash - skopeo inspect docker://@ - ``` - - Compare platform differences: - - Image sizes across platforms - - Layer counts - - Creation timestamps - - Configuration differences - -5. **Analyze Image Layers**: - - List all layers with their sizes - - Calculate total image size - - Identify the largest layers - - Show layer history (if available) - -6. **Extract Configuration Details**: - - Operating system and distribution - - Architecture (amd64, arm64, ppc64le, s390x, etc.) - - Environment variables - - Working directory - - User/UID - - Exposed ports - - Volume mount points - - Labels (including OpenShift/Kubernetes metadata) - -7. **Infer Image Purpose**: - - Analyze image metadata to determine the likely purpose: - - Image name and repository patterns (e.g., "nginx", "postgres", "ocp-release") - - Labels (especially `io.openshift.*`, `io.k8s.*`, `org.opencontainers.*`) - - Entrypoint and command (what executable is being run) - - Exposed ports (common service ports) - - Environment variables (framework indicators, version info) - - Provide context about: - - What the image is (e.g., "web server", "database", "operator", "release payload") - - Common use cases - - Notable characteristics based on configuration - -8. 
**Present Organized Summary**: - - Image identity (digest, tags) - - Inferred purpose and context - - Basic information (OS, architecture, created date) - - Size breakdown - - Configuration summary - - Manifest list details (if applicable) - - Notable labels and annotations - -## Return Value - -The command outputs a structured breakdown of the image: - -``` -================================================================================ -CONTAINER IMAGE INSPECTION -================================================================================ -Image: quay.io/openshift-release-dev/ocp-release:4.20.0-multi - -IMAGE PURPOSE: - This is an OpenShift release image containing the cluster-version-operator - for OpenShift 4.20.0. It's part of the OpenShift release payload used to - manage cluster upgrades and version management. - -BASIC INFORMATION: - Manifest Digest: sha256:4f1e772349a20f2eb69e8cf70d73b4fcc299c15cb6e4f027696eb469e66d4080 - Type: Manifest List (Multi-Architecture) - Manifest Type: Docker Distribution Manifest List v2 - Created: 2025-10-16T13:35:26Z - -MANIFEST LIST DETAILS: - This is a multi-architecture manifest list containing 4 platform variants. - - AVAILABLE PLATFORMS (4): - -------------------------------------------------------------------------------- - 1. linux/amd64 - Digest: sha256:b4bd68afe0fb47bf9876f51e33d88e9dd218fed2dcf41b025740591746dda5c9 - Size: 167.6 MB (175,762,648 bytes) - Layers: 6 - Created: 2025-10-16T13:35:26Z - - 2. linux/arm64 - Digest: sha256:eec6b0e6ff1c4cf5edc158c41a171ac8b02d7e0389715b663528a4ec0931b1f2 - Size: 161.6 MB (169,501,175 bytes) - Layers: 6 - Created: 2025-10-16T13:35:26Z - - 3. linux/ppc64le - Digest: sha256:4bb9eb125d4d35c100699617ec8278691a9cee771ebacb113173b75f0707df56 - Size: 174.4 MB (182,863,818 bytes) - Layers: 6 - Created: 2025-10-16T13:35:26Z - - 4. 
linux/s390x - Digest: sha256:5e852c796f2d3b83b3bd4506973a455a521b6933e3944740b32c1ed483b2174e - Size: 163.2 MB (171,055,271 bytes) - Layers: 6 - Created: 2025-10-16T13:35:26Z - - PLATFORM COMPARISON: - Size Range: 161.6 MB - 174.4 MB (arm64 smallest, ppc64le largest) - Size Variance: ~12.8 MB difference between smallest and largest - Architectures: 4 platforms (amd64, arm64, ppc64le, s390x) - OS: linux (all) - Layer Count: 6 (all platforms) - Build Time: All platforms built simultaneously - - USAGE: - To pull a specific platform: - podman pull --platform=linux/amd64 quay.io/openshift-release-dev/ocp-release:4.20.0-multi - podman pull quay.io/openshift-release-dev/ocp-release@sha256:b4bd68afe0fb... # amd64 - -CONFIGURATION (amd64 example): - User: - WorkingDir: - Entrypoint: ["/usr/bin/cluster-version-operator"] - Cmd: - Env: - - PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin - - BUILD_VERSION=v4.20.0 - - OS_GIT_VERSION=4.20.0-202509230726.p2.g9de00ba.assembly.stream.el9-9de00ba - -EXPOSED PORTS: - - -LABELS: - io.openshift.release: 4.20.0 - io.openshift.release.base-image-digest: sha256:6f58f521f51ae43617d2dead1efbe9690b605d646565892bb0f8c6030a742ba7 - -VOLUMES: - - -LAYER DETAILS (amd64): - Total Layers: 6 - Total Size: 167.6 MB (compressed) -================================================================================ -``` - -## Examples - -1. **Inspect a public image**: - ``` - /container-image:inspect quay.io/openshift-release-dev/ocp-release:4.17.0-x86_64 - ``` - Provides full breakdown of the OpenShift release image. - -2. **Inspect a manifest list**: - ``` - /container-image:inspect registry.redhat.io/ubi9/ubi:latest - ``` - Shows available architectures and platform-specific details. - -3. **Inspect with specific tag**: - ``` - /container-image:inspect docker.io/library/nginx:1.25 - ``` - Analyzes the nginx image with tag 1.25. - -4. **Inspect by digest**: - ``` - /container-image:inspect quay.io/prometheus/prometheus@sha256:abc123... 
- ``` - Inspects a specific image version by its digest. - -5. **Inspect a private registry image**: - ``` - /container-image:inspect registry.example.com/myorg/myapp:v1.0.0 - ``` - Analyzes an image from a private registry (requires authentication). - -## Error Handling - -- **Image not found**: If the image doesn't exist or the name is incorrect: - - Verify the image name and tag - - Check registry accessibility - - Ensure authentication is set up for private registries - -- **Tool not available**: If `skopeo` is not installed: - - Display installation instructions for the user's platform - - Suggest using `podman inspect` as an alternative (if podman is available) - -- **Authentication errors**: If registry requires authentication: - - Prompt user to run `skopeo login <registry>` or `podman login <registry>` - - Provide documentation link for registry authentication - -- **Network errors**: If registry is unreachable: - - Check internet connectivity - - Verify registry URL is correct - - Check for proxy/firewall issues - -## Notes - -- **No Image Pull Required**: `skopeo inspect` fetches metadata without downloading the entire image -- **Manifest Lists**: For multi-arch images, the command automatically detects and shows detailed platform information including per-platform digests, sizes, and configurations -- **Manifest List vs Single Image**: The command clearly distinguishes between manifest lists and single-architecture images -- **Platform Selection**: Container runtimes automatically select the correct platform from a manifest list -- **Digest Pinning**: Always displays the image digest for reproducible deployments -- **Label Standards**: Highlights important labels like OpenShift/Kubernetes metadata -- **Size Accuracy**: Layer sizes are compressed sizes as stored in the registry -- **Size Variations**: Platform-specific images may have different sizes due to architecture differences -- **OCI vs Docker**: Supports both OCI and Docker manifest formats -- **Variant Field**: ARM
images may have variants (v7, v8) for different ARM versions -- **Registry Support**: Works with any OCI-compliant registry - -## Arguments - -- **$1** (image): Required. The full image reference including registry, repository, and tag/digest. - - Format: `[registry/]repository[:tag|@digest]` - - Examples: - - `quay.io/openshift/origin-node:latest` - - `docker.io/library/alpine:3.18` - - `registry.redhat.io/ubi9/ubi@sha256:abc123...` diff --git a/plugins/container-image/commands/tags.md b/plugins/container-image/commands/tags.md deleted file mode 100644 index 02fbd9345..000000000 --- a/plugins/container-image/commands/tags.md +++ /dev/null @@ -1,310 +0,0 @@ ---- -description: List and analyze available tags for a container image repository -argument-hint: ---- - -## Name -container-image:tags - -## Synopsis -``` -/container-image:tags -``` - -## Description - -The `container-image:tags` command lists and analyzes all available tags for a container image repository. It provides detailed information about each tag including creation date, size, architecture support, and digest. - -This command helps you: -- Discover available image versions -- Identify the latest stable releases -- Find images for specific architectures -- Track image update frequency -- Identify deprecated or outdated tags -- Plan image upgrades -- Understand tagging conventions - -The command works with any OCI-compliant registry and can filter, sort, and analyze tags based on various criteria. - -## Prerequisites - -**Required Tools:** - -1. 
**skopeo** - For listing and inspecting tags - - Check if installed: `which skopeo` - - Installation: - - RHEL/Fedora: `sudo dnf install skopeo` - - Ubuntu/Debian: `sudo apt-get install skopeo` - - macOS: `brew install skopeo` - - Documentation: https://github.com/containers/skopeo - -**Registry Authentication:** - -For private registries: -```bash -skopeo login registry.example.com -``` - -## Implementation - -The command performs the following analysis: - -1. **Check Tool Availability**: - - Verify `skopeo` is installed - - If missing, provide installation instructions - -2. **List All Tags**: - ```bash - skopeo list-tags docker:// - ``` - - This returns all available tags for the repository. - -3. **Inspect Each Tag** (for detailed analysis): - For each tag (or a sample of tags for large repositories): - ```bash - skopeo inspect docker://: - ``` - - Extract: - - Image digest - - Creation date - - Size - - Architecture(s) - - Labels - - Manifest type - -4. **Categorize Tags**: - - **Version tags**: Semantic versions (v1.0.0, 2.1.3) - - **Latest tags**: Tags like `latest`, `stable`, `production` - - **Date-based tags**: Tags with dates (20240115, 2024-01-15) - - **Branch tags**: Development branches (main, develop) - - **SHA tags**: Git commit SHAs - - **Custom tags**: Other tagging schemes - -5. **Sort and Filter**: - - Sort by creation date (newest first) - - Sort by semantic version - - Filter by pattern (e.g., only `v4.*` tags) - - Filter by architecture support - - Show only recent tags (e.g., last 30 days) - -6. **Identify Key Tags**: - - Current `latest` tag - - Most recent version tag - - Long-term support (LTS) tags - - Deprecated tags - - Duplicate tags (same digest, different names) - -7. 
**Present Organized Analysis**: - - Summary of tag categories - - Detailed tag list with metadata - - Recommendations for tag selection - - Notable patterns or issues - -## Return Value - -The command outputs a structured tag listing: - -``` -================================================================================ -CONTAINER IMAGE TAGS -================================================================================ -Repository: quay.io/openshift-release-dev/ocp-release - -Total Tags: 487 - -TAG SUMMARY: - Version Tags: 312 (e.g., 4.17.0, 4.16.1) - Date Tags: 150 (e.g., 2024-01-15) - Latest Tags: 3 (latest, stable, production) - Other Tags: 22 - -RECENT TAGS (Last 30 days): --------------------------------------------------------------------------------- -TAG CREATED SIZE ARCH DIGEST -4.17.0 2024-01-15 10:30 1.2 GB multi sha256:abc123... -4.17.0-rc.1 2024-01-10 08:15 1.2 GB multi sha256:def456... -4.16.2 2024-01-08 14:22 1.1 GB multi sha256:ghi789... -latest 2024-01-15 10:30 1.2 GB multi sha256:abc123... -stable 2024-01-08 14:22 1.1 GB multi sha256:ghi789... - -VERSION TAGS (Semantic): --------------------------------------------------------------------------------- -4.17.0 2024-01-15 1.2 GB multi sha256:abc123... [LATEST] -4.17.0-rc.1 2024-01-10 1.2 GB multi sha256:def456... -4.16.2 2024-01-08 1.1 GB multi sha256:ghi789... -4.16.1 2023-12-20 1.1 GB multi sha256:jkl012... -4.16.0 2023-12-01 1.1 GB multi sha256:mno345... -4.15.18 2023-11-28 1.0 GB multi sha256:pqr678... -... - -SPECIAL TAGS: --------------------------------------------------------------------------------- -latest → 4.17.0 (sha256:abc123...) -stable → 4.16.2 (sha256:ghi789...) -lts → 4.15.18 (sha256:pqr678...) - -ARCHITECTURE SUPPORT: - Multi-arch tags: 465 (linux/amd64, linux/arm64, linux/ppc64le, linux/s390x) - Single-arch: 22 (linux/amd64 only) - -DUPLICATE TAGS (same image, multiple tags): - 4.17.0 = latest = 2024-01-15 (sha256:abc123...) - 4.16.2 = stable (sha256:ghi789...) 
- -TAG PATTERNS: - • Semantic versioning (4.x.y) - • Release candidates (-rc.x) - • Date-based snapshots (YYYY-MM-DD) - • Architecture-specific suffixes (-amd64, -arm64) - -RECOMMENDATIONS: - • For production: Use stable (4.16.2) or specific version tag - • For testing: Use latest (4.17.0) - • For LTS: Use lts (4.15.18) - • Avoid: Using generic tags like 'latest' in production - • Pin by digest: Use @sha256:abc123... for reproducibility - -NOTABLE: - • 3 tags updated in the last 7 days - • 15 release candidates available - • Average tag age: 45 days - • Update frequency: ~2 tags per week -================================================================================ -``` - -**For Small Repositories:** -``` -================================================================================ -CONTAINER IMAGE TAGS -================================================================================ -Repository: docker.io/library/alpine - -Total Tags: 47 - -ALL TAGS: --------------------------------------------------------------------------------- -TAG CREATED SIZE ARCH DIGEST -latest 2024-01-20 12:00 7.3 MB multi sha256:abc123... -3.19 2024-01-20 12:00 7.3 MB multi sha256:abc123... -3.18 2023-11-15 09:30 7.0 MB multi sha256:def456... -3.17 2023-08-10 14:15 6.8 MB multi sha256:ghi789... -edge 2024-01-22 08:00 7.5 MB multi sha256:jkl012... -... - -RECOMMENDATIONS: - • For production: Use 3.19 (latest stable) - • For edge features: Use edge - • For compatibility: Use 3.18 or 3.17 -================================================================================ -``` - -## Examples - -1. **List tags for OpenShift release images**: - ``` - /container-image:tags quay.io/openshift-release-dev/ocp-release - ``` - Shows all available OpenShift release versions. - -2. **Check available UBI tags**: - ``` - /container-image:tags registry.redhat.io/ubi9/ubi - ``` - Lists all UBI 9 image tags. - -3. 
**Explore nginx versions**: - ``` - /container-image:tags docker.io/library/nginx - ``` - Shows available nginx versions and variants. - -4. **Check private repository tags**: - ``` - /container-image:tags registry.example.com/myorg/myapp - ``` - Lists tags from a private registry (requires authentication). - -5. **Analyze Prometheus tags**: - ``` - /container-image:tags quay.io/prometheus/prometheus - ``` - Shows Prometheus versions and release patterns. - -## Advanced Options - -The command can support optional filters and sorting: - -**Filter by Pattern:** -``` -/container-image:tags quay.io/openshift-release-dev/ocp-release --filter "4.17.*" -``` -Shows only 4.17.x tags. - -**Limit Results:** -``` -/container-image:tags docker.io/library/alpine --limit 10 -``` -Shows only the 10 most recent tags. - -**Sort Options:** -``` -/container-image:tags quay.io/myapp --sort version # Semantic version sort -/container-image:tags quay.io/myapp --sort date # Creation date sort -/container-image:tags quay.io/myapp --sort size # Size sort -``` - -**Architecture Filter:** -``` -/container-image:tags registry.example.com/myapp --arch arm64 -``` -Shows only tags that support arm64. - -## Error Handling - -- **Repository not found**: Verify repository name and registry -- **Authentication required**: Guide user to login with `skopeo login` -- **Network errors**: Check connectivity and registry availability -- **Tool not available**: Provide installation instructions for `skopeo` -- **Rate limiting**: Handle registry rate limits gracefully -- **Large repositories**: For repositories with 1000+ tags, sample or paginate results - -## Notes - -- **Tag Mutability**: Tags (except digests) can be reassigned to different images -- **Latest Tag**: "latest" doesn't always mean newest; it's just a convention -- **Digest Pinning**: For reproducible deployments, always use digest (@sha256:...) 
-- **Semantic Versioning**: Many projects follow semver (MAJOR.MINOR.PATCH) -- **Multi-arch Support**: Check which tags support your target architecture -- **Deprecation**: Older tags may be removed; check registry retention policies - -## Performance Considerations - -For repositories with many tags: -- The command samples tags rather than inspecting all -- Full inspection can be requested with `--full` flag -- Results can be cached for repeated queries -- Pagination is used for very large tag lists - -## Use Cases - -1. **Version Discovery**: Find the latest stable version before deployment -2. **Update Planning**: Identify available updates for current images -3. **Architecture Planning**: Verify multi-arch support before migration -4. **Cleanup Planning**: Identify old/unused tags for cleanup -5. **Compliance**: Document available versions for audit trails -6. **CI/CD Integration**: Automate image version selection -7. **Troubleshooting**: Compare production tag with available versions - -## Arguments - -- **$1** (repository): Required. The repository path (without tag). - - Format: `[registry/]repository` - - Examples: - - `quay.io/openshift-release-dev/ocp-release` - - `docker.io/library/nginx` - - `registry.redhat.io/ubi9/ubi` - - `registry.example.com/myorg/myapp` - -**Note**: Do NOT include the tag (`:tagname`) in the repository argument. 
diff --git a/plugins/doc/.claude-plugin/plugin.json b/plugins/doc/.claude-plugin/plugin.json deleted file mode 100644 index 3eee66964..000000000 --- a/plugins/doc/.claude-plugin/plugin.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "name": "doc", - "description": "A plugin for engineering documentation and notes", - "version": "0.0.2", - "author": { - "name": "OpenShift Engineering" - } -} diff --git a/plugins/doc/README.md b/plugins/doc/README.md deleted file mode 100644 index 3954ba0d7..000000000 --- a/plugins/doc/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# Doc Plugin - -Engineering documentation and note-taking utilities for Claude Code. - -## Commands - -### `/doc:note` - -Create and manage engineering notes and documentation. - -See [commands/note.md](commands/note.md) for full documentation. - -## Installation - -```bash -/plugin install doc@ai-helpers -``` - diff --git a/plugins/doc/commands/note.md b/plugins/doc/commands/note.md deleted file mode 100644 index 7dc3547a5..000000000 --- a/plugins/doc/commands/note.md +++ /dev/null @@ -1,71 +0,0 @@ ---- -description: Generate professional engineering notes and append them to a log file -argument-hint: "[task description]" ---- - -## Name -doc:note - -## Synopsis -``` -/doc:note [task description] -``` - -## Description -The `doc:note` command generates professional engineering notes about completed tasks and appends them to a persistent log file (`~/engineering-notes.md`). It automatically searches for relevant context including GitHub PR URLs, issue numbers, and Jira ticket references in the conversation history and git repository. - -This command helps engineers maintain a structured record of their daily work, making it easier to: -- Track accomplishments for performance reviews -- Generate status reports and weekly updates -- Maintain a searchable history of technical decisions -- Document completed work with proper attribution - -## Implementation -The command performs the following steps: -1. 
**Context gathering**: Searches conversation history for GitHub PR URLs, issue numbers, or Jira ticket keys (e.g., PROJ-123) -2. **Git analysis**: If in a git repository, checks recent commits and current branch name for references -3. **Note generation**: Creates a 1-2 sentence note with: - - Today's date in YYYY-MM-DD format - - Accomplishment framed in past tense - - Technical details and specific technologies used - - Impact and value delivered - - All relevant links inline -4. **File management**: Appends the note to `~/engineering-notes.md` (creates file if it doesn't exist) with proper spacing - -If the task description argument is omitted, the command will attempt to discover a task description from relevant context (e.g. git repository status and conversation history). If no relevant context is discovered, or if more information is needed, the command will prompt for further context. - -## Return Value -- **Success**: Confirmation message with the generated note -- **File created**: `~/engineering-notes.md` (if it didn't exist) -- **File updated**: Note appended with blank line separator - -## Examples - -1. **Basic usage with task description**: - ``` - /doc:note Implemented user authentication with OAuth2 - ``` - Generates: - ``` - 2025-10-24 - Implemented user authentication using OAuth2. Integrated with Google and GitHub providers, added JWT token management, and secured API endpoints with role-based access control. - - ``` - -2. **Without task description (auto-discovers from context)**: - ``` - /doc:note - ``` - The command analyzes git repository and conversation history to generate a note. If insufficient context is available, it will prompt for details. - -3. **With git context**: - ``` - /doc:note Fixed critical bug in payment processor - ``` - If on a branch named `fix/payment-timeout` with recent commits, generates: - ``` - 2025-10-24 - Fixed critical timeout bug in payment processor (PR #123). 
Optimized database queries and added connection pooling, reducing payment processing time by 60% and eliminating timeout errors. - - ``` - -## Arguments -- `[task description]`: Optional description of the completed task. If omitted, the command attempts to discover context automatically. diff --git a/plugins/etcd/.claude-plugin/plugin.json b/plugins/etcd/.claude-plugin/plugin.json deleted file mode 100644 index 15de5cd7d..000000000 --- a/plugins/etcd/.claude-plugin/plugin.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "name": "etcd", - "description": "Etcd cluster health monitoring and performance analysis utilities", - "version": "0.0.2", - "author": { - "name": "github.com/openshift-eng" - } -} diff --git a/plugins/etcd/README.md b/plugins/etcd/README.md deleted file mode 100644 index 79c981ee3..000000000 --- a/plugins/etcd/README.md +++ /dev/null @@ -1,170 +0,0 @@ -# Etcd Plugin - -A Claude Code plugin for monitoring etcd cluster health and analyzing performance in OpenShift environments. - -## Overview - -This plugin provides commands to help diagnose and troubleshoot etcd-related issues in OpenShift clusters. Etcd is the critical distributed key-value store that holds all cluster state for Kubernetes/OpenShift, and maintaining its health and performance is essential for cluster stability. 
- -## Commands - -### `/etcd:health-check` - -Performs a comprehensive health check of the etcd cluster, examining: -- Etcd pod status and availability -- Cluster health and member status -- Leadership election status -- Database size and fragmentation -- Disk space utilization -- Recent error logs -- Performance metrics (with `--verbose` flag) - -**Usage:** -``` -/etcd:health-check [--verbose] -``` - -**Example:** -``` -/etcd:health-check -/etcd:health-check --verbose -``` - -### `/etcd:analyze-performance` - -Analyzes etcd performance metrics to identify latency issues and bottlenecks, including: -- Disk I/O performance (commit latency, fsync duration) -- Network latency between etcd peers -- Request/response performance by operation type -- Leader stability and proposal metrics -- Database size and fragmentation -- Performance warnings from logs - -**Usage:** -``` -/etcd:analyze-performance [--duration <minutes>] -``` - -**Example:** -``` -/etcd:analyze-performance -/etcd:analyze-performance --duration 15 -``` - -## Prerequisites - -All commands require: - -1. **OpenShift CLI (oc)** - Install from https://mirror.openshift.com/pub/openshift-v4/clients/ocp/ -2. **Active cluster connection** - Must be authenticated to an OpenShift cluster -3. **Cluster admin permissions** - Required to access etcd pods and metrics -4. **Running etcd pods** - At least one etcd pod must be running - -## Installation - -### From Marketplace - -```bash -# Add the marketplace (if not already added) -/plugin marketplace add openshift-eng/ai-helpers - -# Install the etcd plugin -/plugin install etcd@ai-helpers -``` - -### Manual Installation - -```bash -# Clone the repository -git clone https://github.com/openshift-eng/ai-helpers.git - -# Link to your Claude Code plugins directory -ln -s $(pwd)/ai-helpers/plugins/etcd ~/.claude/plugins/etcd -``` - -## Use Cases - -### Troubleshooting Cluster Issues - -When experiencing cluster-wide problems: -1. 
Run `/etcd:health-check` to verify etcd cluster status -2. If issues are found, run `/etcd:analyze-performance` to identify bottlenecks -3. Follow the recommendations provided in the output - -### Performance Tuning - -For proactive performance monitoring: -1. Run `/etcd:analyze-performance --duration 30` for comprehensive analysis -2. Review disk I/O and network latency metrics -3. Compare against recommended thresholds -4. Implement suggested optimizations - -### Capacity Planning - -Before scaling operations: -1. Check current database size with `/etcd:health-check` -2. Analyze performance trends with `/etcd:analyze-performance` -3. Identify if hardware upgrades are needed - -## Common Issues and Solutions - -### High Disk Latency - -**Problem:** Backend commit P99 > 100ms or WAL fsync P99 > 10ms - -**Solutions:** -- Migrate to SSD or NVMe storage -- Use dedicated disks for etcd (not shared with OS) -- Check for competing I/O workloads - -### Frequent Leader Changes - -**Problem:** Leader changes > 5 - -**Solutions:** -- Check network connectivity between etcd nodes -- Ensure nodes are in same datacenter/availability zone -- Verify no clock skew between nodes - -### Large Database Size - -**Problem:** Database size > 8GB or high fragmentation - -**Solutions:** -- Run etcd defragmentation -- Review event retention policies -- Check for excessive key creation - -## Performance Benchmarks - -Recommended thresholds for healthy etcd: -- **Backend commit P99:** < 100ms -- **WAL fsync P99:** < 10ms -- **Peer RTT P99:** < 50ms -- **Leader changes:** < 5 total -- **Database size:** < 8GB -- **Disk usage:** < 80% - -## Security Considerations - -- Commands require cluster-admin or equivalent permissions -- Access to etcd allows viewing all cluster secrets -- Metrics and logs may contain sensitive information -- Performance data should be treated as confidential - -## Resources - -- **Etcd Documentation:** https://etcd.io/docs/ -- **OpenShift Etcd Docs:** 
https://docs.openshift.com/container-platform/latest/backup_and_restore/control_plane_backup_and_restore/ -- **Performance Tuning:** https://etcd.io/docs/latest/tuning/ - -## Contributing - -To contribute improvements or report issues: -1. Visit https://github.com/openshift-eng/ai-helpers -2. Open an issue or pull request -3. Follow the contribution guidelines in the repository - -## License - -This plugin is part of the ai-helpers project and follows the same license terms. diff --git a/plugins/etcd/commands/analyze-performance.md b/plugins/etcd/commands/analyze-performance.md deleted file mode 100644 index 58d523a81..000000000 --- a/plugins/etcd/commands/analyze-performance.md +++ /dev/null @@ -1,602 +0,0 @@ ---- -description: Analyze etcd performance metrics, latency, and identify bottlenecks -argument-hint: "[--duration <minutes>]" ---- - -## Name -etcd:analyze-performance - -## Synopsis -``` -/etcd:analyze-performance [--duration <minutes>] -``` - -## Description - -The `analyze-performance` command analyzes etcd performance metrics to identify latency issues, slow operations, and potential bottlenecks. It examines disk performance, commit latency, network latency, and provides recommendations for optimization. - -Etcd performance is critical for cluster responsiveness. Slow etcd operations can cause: -- API server timeouts -- Slow pod creation and updates -- Controller delays -- Overall cluster sluggishness - -This command is useful for: -- Diagnosing slow cluster operations -- Identifying disk I/O bottlenecks -- Detecting network latency issues -- Capacity planning -- Performance tuning - -## Prerequisites - -Before using this command, ensure you have: - -1. **OpenShift CLI (oc)** - - Install from: https://mirror.openshift.com/pub/openshift-v4/clients/ocp/ - - Verify with: `oc version` - -2. **Active cluster connection** - - Must be connected to an OpenShift cluster - - Verify with: `oc whoami` - -3. 
**Cluster admin permissions** - - Required to access etcd pods and metrics - - Verify with: `oc auth can-i get pods -n openshift-etcd` - -4. **Running etcd pods** - - At least one etcd pod must be running - - Check with: `oc get pods -n openshift-etcd -l app=etcd` - -## Arguments - -- **--duration** (optional): Duration in minutes to analyze logs (default: 5) - - Analyzes recent logs for the specified duration - - Longer durations provide more comprehensive analysis - - Example: `--duration 15` for 15-minute window - -## Implementation - -The command performs the following analysis: - -### 1. Verify Prerequisites - -```bash -if ! command -v oc &> /dev/null; then - echo "Error: oc CLI not found" - exit 1 -fi - -if ! oc whoami &> /dev/null; then - echo "Error: Not connected to cluster" - exit 1 -fi - -# Parse duration argument (default: 5 minutes) -DURATION=5 -if [[ "$1" == "--duration" ]] && [[ -n "$2" ]]; then - DURATION=$2 -fi - -echo "Analyzing etcd performance (last $DURATION minutes)..." -``` - -### 2. Get Running Etcd Pod - -```bash -ETCD_POD=$(oc get pods -n openshift-etcd -l app=etcd --field-selector=status.phase=Running -o jsonpath='{.items[0].metadata.name}') - -if [ -z "$ETCD_POD" ]; then - echo "Error: No running etcd pod found" - exit 1 -fi - -echo "Using etcd pod: $ETCD_POD" -echo "" -``` - -### 3. Analyze Database Performance - -Get database statistics using etcdctl: - -```bash -echo "===============================================" -echo "DATABASE PERFORMANCE ANALYSIS" -echo "===============================================" -echo "" -echo "Fetching database statistics..." 
- -# Get database sizes from endpoint status -DB_STATUS=$(oc exec -n openshift-etcd "$ETCD_POD" -c etcdctl -- etcdctl endpoint status --cluster -w json 2>/dev/null) - -echo "Database Statistics:" -echo "$DB_STATUS" | jq -r '.[] | - "Endpoint: \(.Endpoint) - Version: \(.Status.version) - DB Size: \(.Status.dbSize) bytes (\((.Status.dbSize / 1024 / 1024) | floor)MB) - DB In Use: \(.Status.dbSizeInUse) bytes (\((.Status.dbSizeInUse / 1024 / 1024) | floor)MB) - Keys: \(.Status.header.revision) - Raft Index: \(.Status.raftIndex) - Raft Term: \(.Status.raftTerm) - Leader: \(if .Status.leader == .Status.header.member_id then "YES" else "NO" end) -"' - -echo "" -echo "Fragmentation Analysis:" -echo "$DB_STATUS" | jq -r '.[] | - if .Status.dbSize > 0 then - ((.Status.dbSize - .Status.dbSizeInUse) * 100 / .Status.dbSize) as $frag | - "Endpoint: \(.Endpoint) - Fragmentation: \($frag | floor)%" + - if $frag > 50 then - " - WARNING: High fragmentation detected, consider defragmentation" - elif $frag > 30 then - " - NOTICE: Moderate fragmentation" - else - " - OK" - end - else - "Endpoint: \(.Endpoint) - Fragmentation: N/A" - end' -``` - -### 4. Check Cluster Health - -Verify etcd cluster health: - -```bash -echo "" -echo "===============================================" -echo "CLUSTER HEALTH" -echo "===============================================" -echo "" -oc exec -n openshift-etcd "$ETCD_POD" -c etcdctl -- etcdctl endpoint health --cluster 2>/dev/null || echo "Health check failed" -``` - -### 5. Analyze Logs for Performance Issues - -Parse etcd logs for performance warnings: - -```bash -echo "" -echo "===============================================" -echo "LOG ANALYSIS (Last $DURATION minutes)" -echo "===============================================" -echo "" -echo "Searching for performance-related warnings..." 
- -# Get recent logs -LOGS=$(oc logs -n openshift-etcd "$ETCD_POD" -c etcd --since="${DURATION}m" 2>/dev/null) - -# Count slow operations -SLOW_OPS=$(echo "$LOGS" | grep -i "slow" | wc -l) -echo "Slow operations logged: $SLOW_OPS" - -if [ "$SLOW_OPS" -gt 0 ]; then - echo "" - echo "Recent slow operations (last 10):" - echo "$LOGS" | grep -i "slow" | tail -10 -fi - -echo "" - -# Check for disk warnings -DISK_WARNINGS=$(echo "$LOGS" | grep -iE "disk|fdatasync|fsync" | grep -iE "slow|took|latency" | wc -l) -echo "Disk-related warnings: $DISK_WARNINGS" - -if [ "$DISK_WARNINGS" -gt 0 ]; then - echo "" - echo "Disk performance warnings:" - echo "$LOGS" | grep -iE "disk|fdatasync|fsync" | grep -iE "slow|took|latency" | tail -5 -fi - -echo "" - -# Check for apply warnings -APPLY_WARNINGS=$(echo "$LOGS" | grep -iE "apply.*took|slow.*apply" | wc -l) -echo "Apply operation warnings: $APPLY_WARNINGS" - -if [ "$APPLY_WARNINGS" -gt 0 ]; then - echo "" - echo "Apply warnings:" - echo "$LOGS" | grep -iE "apply.*took|slow.*apply" | tail -5 -fi - -echo "" - -# Check for compaction info -echo "Recent compaction operations:" -echo "$LOGS" | grep "finished scheduled compaction" | tail -3 -if [ $(echo "$LOGS" | grep "finished scheduled compaction" | wc -l) -eq 0 ]; then - echo " No compaction operations in this time window" -fi - -echo "" - -# Check for snapshot operations -echo "Snapshot operations:" -SNAPSHOTS=$(echo "$LOGS" | grep -i "snapshot" | wc -l) -echo "Snapshot events: $SNAPSHOTS" -if [ "$SNAPSHOTS" -gt 0 ]; then - echo "$LOGS" | grep -i "snapshot" | tail -3 -fi -``` - -### 6. 
Analyze Leader Stability - -Check for leader changes and stability issues: - -```bash -echo "" -echo "===============================================" -echo "LEADER STABILITY ANALYSIS" -echo "===============================================" -echo "" - -LEADER_CHANGES=$(echo "$LOGS" | grep -i "leader.*changed\|became leader\|lost leader" | wc -l) -echo "Leader change events: $LEADER_CHANGES" - -if [ "$LEADER_CHANGES" -gt 0 ]; then - echo "" - echo "Leader change events:" - echo "$LOGS" | grep -i "leader.*changed\|became leader\|lost leader" -fi - -# Check for proposal/commit issues -echo "" -echo "Proposal and commit operations:" -PROPOSAL_LOGS=$(echo "$LOGS" | grep -iE "proposal|commit" | grep -iE "slow|took|failed" | wc -l) -echo "Slow proposal/commit operations: $PROPOSAL_LOGS" - -if [ "$PROPOSAL_LOGS" -gt 0 ]; then - echo "" - echo "Sample slow operations:" - echo "$LOGS" | grep -iE "proposal|commit" | grep -iE "slow|took|failed" | tail -5 -fi -``` - -### 7. Analyze Network Performance - -Check for network-related issues: - -```bash -echo "" -echo "===============================================" -echo "NETWORK ANALYSIS" -echo "===============================================" -echo "" - -NETWORK_ISSUES=$(echo "$LOGS" | grep -iE "network|connection|timeout|peer" | grep -iE "error|fail|slow" | wc -l) -echo "Network-related issues: $NETWORK_ISSUES" - -if [ "$NETWORK_ISSUES" -gt 0 ]; then - echo "" - echo "Network issues:" - echo "$LOGS" | grep -iE "network|connection|timeout|peer" | grep -iE "error|fail|slow" | tail -5 -fi -``` - -### 8. 
Generate Performance Summary - -Create summary with recommendations: - -```bash -echo "" -echo "===============================================" -echo "PERFORMANCE SUMMARY & RECOMMENDATIONS" -echo "===============================================" -echo "" - -ISSUES=0 -WARNINGS=0 - -# Check fragmentation from DB status -MAX_FRAG=$(echo "$DB_STATUS" | jq -r '[.[] | if .Status.dbSize > 0 then ((.Status.dbSize - .Status.dbSizeInUse) * 100 / .Status.dbSize) else 0 end] | max') - -if (( $(echo "$MAX_FRAG > 50" | bc -l 2>/dev/null || echo 0) )); then - echo "ISSUE: High database fragmentation (${MAX_FRAG}%)" - echo " Recommendation: Run defragmentation on all etcd members" - echo " Command: oc exec -n openshift-etcd -c etcdctl -- etcdctl defrag" - echo "" - ISSUES=$((ISSUES + 1)) -elif (( $(echo "$MAX_FRAG > 30" | bc -l 2>/dev/null || echo 0) )); then - echo "WARNING: Moderate database fragmentation (${MAX_FRAG}%)" - echo " Recommendation: Monitor and consider defragmentation if performance degrades" - echo "" - WARNINGS=$((WARNINGS + 1)) -fi - -if [ "$LEADER_CHANGES" -gt 5 ]; then - echo "WARNING: Frequent leader changes ($LEADER_CHANGES in last ${DURATION}m)" - echo " Recommendation: Check network stability between etcd nodes" - echo " - Verify network latency between control plane nodes" - echo " - Check for packet loss or network congestion" - echo "" - WARNINGS=$((WARNINGS + 1)) -fi - -if [ "$SLOW_OPS" -gt 10 ]; then - echo "WARNING: High number of slow operations ($SLOW_OPS in last ${DURATION}m)" - echo " Recommendation: Investigate disk I/O and workload patterns" - echo " - Check disk performance with 'fio' benchmarks" - echo " - Review etcd workload and consider optimization" - echo "" - WARNINGS=$((WARNINGS + 1)) -fi - -if [ "$DISK_WARNINGS" -gt 5 ]; then - echo "WARNING: Multiple disk performance warnings ($DISK_WARNINGS in last ${DURATION}m)" - echo " Recommendation: Investigate disk I/O performance" - echo " - Ensure etcd is using SSD/NVMe storage" - echo " - 
Check for disk saturation or competing I/O" - echo " - Verify disk benchmarks meet etcd requirements (> 50 sequential IOPS)" - echo "" - WARNINGS=$((WARNINGS + 1)) -fi - -# Get average DB size -AVG_DB_SIZE=$(echo "$DB_STATUS" | jq -r '[.[] | .Status.dbSize] | add / length') -AVG_DB_SIZE_MB=$(echo "scale=0; $AVG_DB_SIZE / 1024 / 1024" | bc) - -if [ "$AVG_DB_SIZE_MB" -gt 8000 ]; then - echo "WARNING: Large database size (${AVG_DB_SIZE_MB}MB)" - echo " Recommendation: Review data retention and compaction policies" - echo " - Check event retention policies" - echo " - Consider more frequent compaction" - echo "" - WARNINGS=$((WARNINGS + 1)) -fi - -echo "Performance Metrics Summary:" -echo " - Database size: ${AVG_DB_SIZE_MB}MB (recommended: < 8GB)" -echo " - Fragmentation: ${MAX_FRAG}% (recommended: < 30%)" -echo " - Slow operations (${DURATION}m): $SLOW_OPS (recommended: < 10)" -echo " - Leader changes (${DURATION}m): $LEADER_CHANGES (recommended: < 5)" -echo "" - -if [ "$ISSUES" -eq 0 ] && [ "$WARNINGS" -eq 0 ]; then - echo "Status: ✓ HEALTHY - Performance within acceptable ranges" - exit 0 -elif [ "$ISSUES" -gt 0 ]; then - echo "Status: ✗ CRITICAL - Found $ISSUES performance issues requiring attention" - exit 1 -else - echo "Status: ⚠ WARNING - Found $WARNINGS performance warnings" - exit 0 -fi -``` - -## Return Value - -- **Exit 0**: Performance is acceptable (may have warnings) -- **Exit 1**: Critical performance issues detected - -**Output Format**: -- Structured sections for different performance aspects -- Metrics with percentile values (P50, P99) -- Warnings for values exceeding thresholds -- Recommendations for remediation - -## Examples - -### Example 1: Basic performance analysis -``` -/etcd:analyze-performance -``` - -Output: -``` -=============================================== -ETCD PERFORMANCE ANALYSIS -=============================================== -Analyzing etcd performance (last 5 minutes)... 
-Using etcd pod: etcd-dis016-p6vvv-master-0.us-central1-a.c.openshift-qe.internal - -=============================================== -DATABASE PERFORMANCE ANALYSIS -=============================================== - -Fetching database statistics... -Database Statistics: -Endpoint: https://10.0.0.5:2379 - Version: 3.5.24 - DB Size: 94941184 bytes (90MB) - DB In Use: 51789824 bytes (49MB) - Keys: 50240 - Raft Index: 57097 - Raft Term: 8 - Leader: YES - -Endpoint: https://10.0.0.3:2379 - Version: 3.5.24 - DB Size: 95363072 bytes (90MB) - DB In Use: 51789824 bytes (49MB) - Keys: 50240 - Raft Index: 57097 - Raft Term: 8 - Leader: NO - -Endpoint: https://10.0.0.6:2379 - Version: 3.5.24 - DB Size: 94613504 bytes (90MB) - DB In Use: 51834880 bytes (49MB) - Keys: 50240 - Raft Index: 57097 - Raft Term: 8 - Leader: NO - -Fragmentation Analysis: -Endpoint: https://10.0.0.5:2379 - Fragmentation: 45% - NOTICE: Moderate fragmentation -Endpoint: https://10.0.0.3:2379 - Fragmentation: 45% - NOTICE: Moderate fragmentation -Endpoint: https://10.0.0.6:2379 - Fragmentation: 45% - NOTICE: Moderate fragmentation - -=============================================== -CLUSTER HEALTH -=============================================== - -https://10.0.0.5:2379 is healthy: successfully committed proposal: took = 9.848973ms -https://10.0.0.3:2379 is healthy: successfully committed proposal: took = 14.309216ms -https://10.0.0.6:2379 is healthy: successfully committed proposal: took = 14.829731ms - -=============================================== -LOG ANALYSIS (Last 5 minutes) -=============================================== - -Searching for performance-related warnings... 
-Slow operations logged: 0 -Disk-related warnings: 0 -Apply operation warnings: 0 - -Recent compaction operations: -{"level":"info","ts":"2025-11-19T06:15:10.136401Z","caller":"mvcc/kvstore_compaction.go:72","msg":"finished scheduled compaction","compact-revision":48026,"took":"175.577699ms","hash":1330697744} - -=============================================== -LEADER STABILITY ANALYSIS -=============================================== - -Leader change events: 0 - -=============================================== -NETWORK ANALYSIS -=============================================== - -Network-related issues: 0 - -=============================================== -PERFORMANCE SUMMARY & RECOMMENDATIONS -=============================================== - -WARNING: Moderate database fragmentation (45%) - Recommendation: Monitor and consider defragmentation if performance degrades - -Performance Metrics Summary: - - Database size: 90MB (recommended: < 8GB) - - Fragmentation: 45% (recommended: < 30%) - - Slow operations (5m): 0 (recommended: < 10) - - Leader changes (5m): 0 (recommended: < 5) - -Status: ⚠ WARNING - Found 1 performance warnings -``` - -### Example 2: Extended analysis window -``` -/etcd:analyze-performance --duration 30 -``` - -## Common Performance Issues - -### High Database Fragmentation - -**Symptoms**: Database size significantly larger than in-use size (>30% fragmentation) - -**Investigation**: -```bash -# Check current fragmentation -oc exec -n openshift-etcd -c etcdctl -- etcdctl endpoint status --cluster -w json | jq -``` - -**Remediation**: -```bash -# Defragment each etcd member (run one at a time) -oc exec -n openshift-etcd -c etcdctl -- etcdctl defrag --cluster -``` - -**Recommendations**: -- Schedule regular defragmentation during maintenance windows -- Monitor fragmentation trends over time -- Consider defragmentation when >30% fragmented - -### Slow Disk I/O - -**Symptoms**: -- Disk-related warnings in logs (fsync, fdatasync) -- Slow apply 
operations -- High compaction times (>500ms) - -**Investigation**: -```bash -# Check disk performance on etcd nodes -oc debug node/ -- chroot /host fio --name=test --rw=write --bs=4k --size=1G --direct=1 -``` - -**Recommendations**: -- Use SSD or NVMe storage for etcd -- Ensure dedicated disks for etcd (not shared with OS) -- Check for disk saturation or competing I/O -- Verify disk benchmarks meet etcd requirements (> 50 sequential IOPS) - -### Frequent Leader Changes - -**Symptoms**: Multiple leader change events in logs - -**Investigation**: -```bash -# Test network latency between control plane nodes -oc debug node/ -- ping - -# Check for network packet loss -oc debug node/ -- ping -c 100 -``` - -**Recommendations**: -- Ensure etcd nodes are in same datacenter/availability zone -- Check for network congestion or packet loss -- Verify MTU settings across cluster network -- Review network firewall rules and QoS settings - -### Large Database Size - -**Symptoms**: -- Database size >8GB -- Slow operations -- High memory usage - -**Investigation**: -```bash -# Check database size across cluster -oc exec -n openshift-etcd -c etcdctl -- etcdctl endpoint status --cluster -w table -``` - -**Remediation**: -```bash -# Check event retention settings -oc get kubeapiserver cluster -o yaml | grep -A5 eventTTL - -# Review compaction settings -oc logs -n openshift-etcd -c etcd | grep compaction -``` - -**Recommendations**: -- Review event retention policies -- Consider more frequent compaction -- Check for key churn and unnecessary data -- Monitor database growth trends - -## Security Considerations - -- Metrics may expose cluster operational details -- Requires cluster-admin permissions -- Log analysis may contain sensitive data -- Performance data should be treated as confidential - -## See Also - -- Etcd performance guide: https://etcd.io/docs/latest/tuning/ -- OpenShift etcd docs: 
https://docs.openshift.com/container-platform/latest/scalability_and_performance/recommended-performance-scale-practices/ -- Related commands: `/etcd:health-check` - -## Notes - -- This command uses `etcdctl` and log analysis rather than direct metrics endpoint access -- Performance thresholds are based on etcd upstream recommendations -- Disk benchmarks should show > 50 sequential IOPS for etcd -- Network latency < 50ms recommended between members -- Analysis is point-in-time; trends require repeated checks over time -- Compatible with etcd 3.5+ (OpenShift 4.x) -- Log analysis window can be adjusted with `--duration` parameter -- For production clusters, consider running during low-traffic periods -- Health check latency is measured by actual proposal commits to the cluster diff --git a/plugins/etcd/commands/health-check.md b/plugins/etcd/commands/health-check.md deleted file mode 100644 index cf52e1bbf..000000000 --- a/plugins/etcd/commands/health-check.md +++ /dev/null @@ -1,460 +0,0 @@ ---- -description: Check etcd cluster health, member status, and identify issues -argument-hint: "[--verbose]" ---- - -## Name -etcd:health-check - -## Synopsis -``` -/etcd:health-check [--verbose] -``` - -## Description - -The `health-check` command performs a comprehensive health check of the etcd cluster in an OpenShift environment. It examines etcd member status, cluster health, leadership, connectivity, and identifies potential issues that could affect cluster stability. - -Etcd is the critical key-value store that holds all cluster state for Kubernetes/OpenShift. Issues related to etcd can cause cluster-wide failures, so monitoring its health is essential. - -This command is useful for: -- Diagnosing cluster control plane issues -- Verifying etcd cluster stability -- Identifying split-brain scenarios -- Checking member synchronization -- Detecting disk space issues -- Monitoring etcd performance - -## Prerequisites - -Before using this command, ensure you have: - -1. 
**OpenShift CLI (oc)** - - Install from: https://mirror.openshift.com/pub/openshift-v4/clients/ocp/ - - Verify with: `oc version` - -2. **Active cluster connection** - - Must be connected to an OpenShift cluster - - Verify with: `oc whoami` - -3. **Cluster admin permissions** - - Required to access etcd pods and execute commands - - Verify with: `oc auth can-i get pods -n openshift-etcd` - -4. **Healthy etcd namespace** - - The openshift-etcd namespace must exist - - At least one etcd pod must be running - -## Arguments - -- **--verbose** (optional): Enable detailed output - - Shows etcd member details - - Displays performance metrics - - Includes log snippets for errors - - Provides additional diagnostic information - -## Implementation - -The command performs the following checks: - -### 1. Verify Prerequisites - -Check if oc CLI is available and cluster is accessible: - -```bash -if ! command -v oc &> /dev/null; then - echo "Error: oc CLI not found. Please install OpenShift CLI." - exit 1 -fi - -if ! oc whoami &> /dev/null; then - echo "Error: Not connected to an OpenShift cluster." - exit 1 -fi -``` - -### 2. Check Etcd Namespace and Pods - -Verify the etcd namespace exists and get pod status: - -```bash -echo "Checking etcd namespace and pods..." - -if ! 
oc get namespace openshift-etcd &> /dev/null; then - echo "CRITICAL: openshift-etcd namespace not found" - exit 1 -fi - -# Get etcd pod status -ETCD_PODS=$(oc get pods -n openshift-etcd -l app=etcd -o json) -TOTAL_PODS=$(echo "$ETCD_PODS" | jq '.items | length') -RUNNING_PODS=$(echo "$ETCD_PODS" | jq '[.items[] | select(.status.phase == "Running")] | length') - -echo "Etcd pods: $RUNNING_PODS/$TOTAL_PODS running" - -if [ "$RUNNING_PODS" -eq 0 ]; then - echo "CRITICAL: No etcd pods are running" - exit 1 -fi - -# List all etcd pods with status -echo "" -echo "Etcd Pod Status:" -oc get pods -n openshift-etcd -l app=etcd -o custom-columns=NAME:.metadata.name,STATUS:.status.phase,READY:.status.containerStatuses[0].ready,RESTARTS:.status.containerStatuses[0].restartCount,NODE:.spec.nodeName -``` - -### 3. Check Etcd Cluster Health - -Use etcdctl to check cluster health from each running etcd pod: - -```bash -echo "" -echo "Checking etcd cluster health..." - -# Get the first running etcd pod -ETCD_POD=$(oc get pods -n openshift-etcd -l app=etcd --field-selector=status.phase=Running -o jsonpath='{.items[0].metadata.name}') - -if [ -z "$ETCD_POD" ]; then - echo "CRITICAL: No running etcd pod found" - exit 1 -fi - -# Check cluster health -HEALTH_OUTPUT=$(oc exec -n openshift-etcd "$ETCD_POD" -c etcdctl -- etcdctl endpoint health --cluster -w table 2>&1) - -if echo "$HEALTH_OUTPUT" | grep -q "is healthy"; then - echo "Cluster Health Status:" - echo "$HEALTH_OUTPUT" -else - echo "CRITICAL: Etcd cluster health check failed" - echo "$HEALTH_OUTPUT" - exit 1 -fi -``` - -### 4. Check Etcd Member List - -List all etcd members and verify quorum: - -```bash -echo "" -echo "Checking etcd member list..." 
- -MEMBER_LIST=$(oc exec -n openshift-etcd "$ETCD_POD" -c etcdctl -- etcdctl member list -w table 2>&1) - -echo "Etcd Members:" -echo "$MEMBER_LIST" - -# Count members -MEMBER_COUNT=$(oc exec -n openshift-etcd "$ETCD_POD" -c etcdctl -- etcdctl member list -w json 2>/dev/null | jq '.members | length') - -echo "" -echo "Total members: $MEMBER_COUNT" - -if [ "$MEMBER_COUNT" -lt 3 ]; then - echo "WARNING: Etcd cluster has less than 3 members (quorum at risk)" -fi - -# Check for unstarted members -UNSTARTED=$(echo "$MEMBER_LIST" | grep "unstarted" | wc -l) -if [ "$UNSTARTED" -gt 0 ]; then - echo "WARNING: $UNSTARTED member(s) in unstarted state" -fi -``` - -### 5. Check Etcd Leadership - -Verify there is a healthy leader: - -```bash -echo "" -echo "Checking etcd leadership..." - -ENDPOINT_STATUS=$(oc exec -n openshift-etcd "$ETCD_POD" -c etcdctl -- etcdctl endpoint status --cluster -w table 2>&1) - -echo "Endpoint Status:" -echo "$ENDPOINT_STATUS" - -# Check if there's a leader -if echo "$ENDPOINT_STATUS" | grep -q "true"; then - LEADER_ENDPOINT=$(echo "$ENDPOINT_STATUS" | grep "true" | awk '{print $2}') - echo "" - echo "Leader: $LEADER_ENDPOINT" -else - echo "CRITICAL: No etcd leader elected" - exit 1 -fi -``` - -### 6. Check Etcd Database Size - -Check database size and fragmentation: - -```bash -echo "" -echo "Checking etcd database size..." 
- -# Get database size from endpoint status -DB_SIZE=$(oc exec -n openshift-etcd "$ETCD_POD" -c etcdctl -- etcdctl endpoint status --cluster -w json 2>/dev/null) - -echo "$DB_SIZE" | jq -r '.[] | "Endpoint: \(.Endpoint) | DB Size: \(.Status.dbSize) bytes | DB Size in Use: \(.Status.dbSizeInUse) bytes"' - -# Calculate fragmentation percentage -echo "$DB_SIZE" | jq -r '.[] | - if .Status.dbSize > 0 then - "Fragmentation: \(((.Status.dbSize - .Status.dbSizeInUse) * 100 / .Status.dbSize) | floor)%" - else - "Fragmentation: N/A" - end' - -# Warn if database is too large -MAX_DB_SIZE=$((8 * 1024 * 1024 * 1024)) # 8GB threshold -CURRENT_SIZE=$(echo "$DB_SIZE" | jq -r '.[0].Status.dbSize') - -if [ "$CURRENT_SIZE" -gt "$MAX_DB_SIZE" ]; then - echo "WARNING: Etcd database size ($CURRENT_SIZE bytes) exceeds recommended maximum (8GB)" - echo "Consider defragmentation or checking for excessive key growth" -fi -``` - -### 7. Check Disk Space on Etcd Nodes - -Verify disk space on nodes running etcd: - -```bash -echo "" -echo "Checking disk space on etcd nodes..." - -for pod in $(oc get pods -n openshift-etcd -l app=etcd --field-selector=status.phase=Running -o jsonpath='{.items[*].metadata.name}'); do - echo "Pod: $pod" - oc exec -n openshift-etcd "$pod" -c etcd -- df -h /var/lib/etcd | tail -1 - - # Get disk usage percentage - DISK_USAGE=$(oc exec -n openshift-etcd "$pod" -c etcd -- df -h /var/lib/etcd | tail -1 | awk '{print $5}' | sed 's/%//') - - if [ "$DISK_USAGE" -gt 80 ]; then - echo "WARNING: Disk usage on $pod is ${DISK_USAGE}% (threshold: 80%)" - fi - echo "" -done -``` - -### 8. Check for Recent Etcd Errors - -Check recent logs for errors or warnings: - -```bash -echo "" -echo "Checking recent etcd logs for errors..." 
- -RECENT_ERRORS=$(oc logs -n openshift-etcd "$ETCD_POD" -c etcd --tail=100 | grep -i "error\|warn\|fatal" | tail -10) - -if [ -n "$RECENT_ERRORS" ]; then - echo "Recent errors/warnings found:" - echo "$RECENT_ERRORS" -else - echo "No recent errors in etcd logs" -fi -``` - -### 9. Check Etcd Performance Metrics (if --verbose) - -If verbose mode is enabled, check performance metrics: - -```bash -if [ "$VERBOSE" = "true" ]; then - echo "" - echo "Checking etcd performance metrics..." - - # Get metrics from etcd pod - METRICS=$(oc exec -n openshift-etcd "$ETCD_POD" -c etcd -- curl -s http://localhost:2379/metrics 2>/dev/null) - - # Parse key metrics - echo "Backend Commit Duration (p99):" - echo "$METRICS" | grep "etcd_disk_backend_commit_duration_seconds" | grep "quantile=\"0.99\"" | head -1 - - echo "" - echo "WAL Fsync Duration (p99):" - echo "$METRICS" | grep "etcd_disk_wal_fsync_duration_seconds" | grep "quantile=\"0.99\"" | head -1 - - echo "" - echo "Leader Changes:" - echo "$METRICS" | grep "etcd_server_leader_changes_seen_total" | head -1 -fi -``` - -### 10. 
Generate Summary Report - -Create a summary of findings: - -```bash -echo "" -echo "===============================================" -echo "Etcd Health Check Summary" -echo "===============================================" -echo "Check Time: $(date)" -echo "Cluster: $(oc whoami --show-server)" -echo "" -echo "Results:" -echo " Etcd Pods Running: $RUNNING_PODS/$TOTAL_PODS" -echo " Cluster Members: $MEMBER_COUNT" -echo " Leader Elected: Yes" -echo " Cluster Health: Healthy" -echo "" - -if [ "$WARNINGS" -gt 0 ]; then - echo "Status: WARNING - Found $WARNINGS warnings requiring attention" - exit 0 -else - echo "Status: HEALTHY - All checks passed" - exit 0 -fi -``` - -## Return Value - -The command returns different exit codes: - -- **Exit 0**: Etcd cluster is healthy (may have warnings) -- **Exit 1**: Critical issues detected (no running pods, no leader, health check failed) - -**Output Format**: -- Human-readable report with section headers -- Critical issues marked with "CRITICAL:" -- Warnings marked with "WARNING:" -- Success indicators for healthy checks - -## Examples - -### Example 1: Basic health check -``` -/etcd:health-check -``` - -Output: -``` -Checking etcd namespace and pods... -Etcd pods: 3/3 running - -Etcd Pod Status: -NAME STATUS READY RESTARTS NODE -etcd-ip-10-0-21-125.us-east-2... Running true 0 ip-10-0-21-125 -etcd-ip-10-0-43-249.us-east-2... Running true 0 ip-10-0-43-249 -etcd-ip-10-0-68-109.us-east-2... Running true 0 ip-10-0-68-109 - -Checking etcd cluster health... -Cluster Health Status: -+------------------------------------------+--------+ -| ENDPOINT | HEALTH | -+------------------------------------------+--------+ -| https://10.0.21.125:2379 | true | -| https://10.0.43.249:2379 | true | -| https://10.0.68.109:2379 | true | -+------------------------------------------+--------+ - -Checking etcd member list... 
-Etcd Members: -+------------------+---------+------------------------+ -| ID | STATUS | NAME | -+------------------+---------+------------------------+ -| 3a2b1c4d5e6f7890 | started | ip-10-0-21-125 | -| 4b3c2d5e6f708901 | started | ip-10-0-43-249 | -| 5c4d3e6f70890123 | started | ip-10-0-68-109 | -+------------------+---------+------------------------+ - -Total members: 3 - -Checking etcd leadership... -Leader: https://10.0.21.125:2379 - -=============================================== -Etcd Health Check Summary -=============================================== -Status: HEALTHY - All checks passed -``` - -### Example 2: Verbose health check with metrics -``` -/etcd:health-check --verbose -``` - -## Common Issues and Remediation - -### No Etcd Leader - -**Symptoms**: Cluster shows no leader elected - -**Investigation**: -```bash -oc logs -n openshift-etcd -c etcd | grep -i "leader" -oc get events -n openshift-etcd -``` - -**Remediation**: -- Check network connectivity between etcd members -- Verify etcd pods are running on different nodes -- Check for clock skew between nodes - -### High Database Size - -**Symptoms**: Database size exceeds 8GB - -**Investigation**: -```bash -oc exec -n openshift-etcd -c etcdctl -- etcdctl endpoint status -w table -``` - -**Remediation**: -- Run defragmentation: `/etcd:defrag` (if command exists) -- Check for excessive key creation (e.g., many events) -- Review retention policies - -### Disk Space Issues - -**Symptoms**: Disk usage > 80% on etcd data directory - -**Investigation**: -```bash -oc exec -n openshift-etcd -c etcd -- df -h /var/lib/etcd -``` - -**Remediation**: -- Clean up old snapshots -- Defragment database -- Increase disk size if needed - -### Member Not Started - -**Symptoms**: Member shows "unstarted" status - -**Investigation**: -```bash -oc logs -n openshift-etcd -c etcd -oc describe pod -n openshift-etcd -``` - -**Remediation**: -- Check pod logs for errors -- Verify certificates are valid -- Check network 
policies and firewall rules - -## Security Considerations - -- Requires cluster-admin or equivalent permissions -- Access to etcd data allows viewing all cluster secrets -- Etcd metrics may contain sensitive information -- Always use secure connections when accessing etcd - -## See Also - -- Etcd documentation: https://etcd.io/docs/ -- OpenShift etcd docs: https://docs.openshift.com/container-platform/latest/backup_and_restore/control_plane_backup_and_restore/ -- Related commands: `/etcd:analyze-performance` - -## Notes - -- This command is read-only and does not modify etcd -- Checks are performed from within etcd pods using etcdctl -- Some checks require etcd to be running -- Performance may vary on large clusters with many keys -- Database size recommendations are based on upstream etcd guidance diff --git a/plugins/git/commands/cherry-pick-by-patch.md b/plugins/git/commands/cherry-pick-by-patch.md deleted file mode 100644 index c2cbcd0f4..000000000 --- a/plugins/git/commands/cherry-pick-by-patch.md +++ /dev/null @@ -1,50 +0,0 @@ ---- -argument-hint: -description: Cherry-pick git commit into current branch by "patch" command ---- - -## Name -git:cherry-pick-by-patch - -## Synopsis -``` -/git:cherry-pick-by-patch commit_hash -``` - -## Description - -The `/git:cherry-pick-by-patch commit_hash` command cherry-picks commit with hash -`commit_hash` into current branch. Rather than doing `git cherry-pick commit_hash`, -the command streams the output of `git show commit_hash` to -`patch -p1 --no-backup-if-mismatch`, and then commits the changes with commit message -from `commit_hash` commit. - -## Implementation - -### Pre-requisites - -The commit with hash `commit_hash` must exist. To verify that use: -```bash -git show commit_hash -``` -and check if exit code is zero. - -Fail, if there is no `commit_hash` in the current repository checkout. - -### Cherry-pick `commit_hash` into current branch - -1. 
Execute command - ```bash - git show commit_hash | patch -p1 --no-backup-if-mismatch - ``` -and check if exit code is zero. Fail if exit code is not zero. - -2. Find files removed from local checkout by the patch command and execute `git rm` for them. - -3. Find files added or modified by the patch command and execute `git add` for them. - -4. Commit changes by `git commit` command and use commit title and description from `commit_hash` commit. - -## Arguments - -- **$1** (required): Commit hash (e.g., `902409c0`) of commit to cherry-pick. diff --git a/plugins/git/commands/fix-cherrypick-robot-pr.md b/plugins/git/commands/fix-cherrypick-robot-pr.md deleted file mode 100644 index 8f7c6313a..000000000 --- a/plugins/git/commands/fix-cherrypick-robot-pr.md +++ /dev/null @@ -1,265 +0,0 @@ ---- -description: Fix a cherrypick-robot PR that needs manual intervention -argument-hint: [error-messages] ---- - -## Name -git:fix-cherrypick-robot-pr - -## Synopsis -``` -/git:fix-cherrypick-robot-pr [error-messages] -``` - -## Description - -The `git:fix-cherrypick-robot-pr` command replaces a cherrypick-robot PR with a clean, manually-crafted cherry-pick PR that includes fixes the robot cannot handle. - -The cherrypick-robot creates automated PRs but cannot: -- Fix verification failures (JSON validation, missing annotations) -- Resolve merge conflicts -- Add context-specific fixes -- Handle edge cases requiring human judgment -- Apply repository-specific cleanup - -This command helps you create a replacement PR with all necessary fixes applied. - -## Implementation - -### 1. Extract Information from the Robot PR - -Use `gh pr view ` to extract: -- Base branch (e.g., `release-4.19`) -- PR title (to extract bug ID like `OCPBUGS-65944`) -- All commit hashes included in the PR -- PR number for later closure -- Current PR checks/CI status - -Example: -```bash -gh pr view --json baseRefName,title,commits,number,statusCheckRollup -``` - -### 2. 
Analyze Error Messages - -Parse the provided error output to identify: -- Root causes (JSON validation, missing annotations, conflicts, etc.) -- Affected files -- Required fixes -- Fix strategy - -**Error sources (in priority order):** -1. User-provided error messages (from command arguments) -2. File path if provided (e.g., `/path/to/ci-errors.log`) -3. CI failure URL if provided -4. Automatically fetch from PR status checks - -### 3. Discover Git Remotes and Create Branch - -```bash -# Discover the upstream remote (the main repository) -# Look for a remote that's not owned by the current user -UPSTREAM_REMOTE=$(git remote -v | grep "fetch" | grep -v "$(git config user.name)" | awk '{print $1}' | head -1) - -# Discover the fork remote (your fork) -FORK_REMOTE=$(git remote -v | grep "$(git config user.name).*push" | awk '{print $1}' | head -1) - -# If not found, fall back to common names -UPSTREAM_REMOTE=${UPSTREAM_REMOTE:-upstream} -FORK_REMOTE=${FORK_REMOTE:-origin} - -# Fetch the latest base branch -git fetch $UPSTREAM_REMOTE - -# Create new branch following naming convention -git checkout -b cherry-pick--to- $UPSTREAM_REMOTE/ -``` - -Example branch name: `cherry-pick-12345-to-release-1.0` - -### 4. Cherry-Pick Commits - -Cherry-pick all commits from the robot PR in order: - -```bash -# For each commit hash extracted from the robot PR -git cherry-pick - -# OR use the cherry-pick-by-patch command -/git:cherry-pick-by-patch -``` - -Handle any conflicts that arise during cherry-picking. - -### 5. Apply Necessary Fixes Based on Errors - -Based on the error analysis from step 2, apply the necessary fixes: - -**Analyze the errors to determine:** -1. Which files are causing failures -2. What type of failure (validation, conflict, test, build) -3. 
What fix strategy is appropriate for the repository - -**Common fix strategies:** - -- **Validation failures**: Check if files can be excluded from validation or need correction -- **Generated file mismatches**: Run repository update/regeneration scripts (e.g., `make update`, `make generate`) -- **Merge conflicts**: Resolve conflicts by reviewing both sides and understanding the target branch context -- **Test failures**: Update tests to be compatible with the target branch -- **Build failures**: Update dependencies or build configuration for the target branch - -**Apply fixes with clear commits:** -```bash -# Make necessary changes based on error analysis -# Stage and commit each logical fix separately -git add -git commit -m "" -``` - -**Note**: The specific fix commands will vary by repository. Consult the repository's documentation for: -- Verification script locations and options -- Code generation/update commands -- Testing conventions -- Contribution guidelines - -### 6. Push and Create Replacement PR - -```bash -# Use the discovered fork remote (from step 3) -# If running this step separately, rediscover the fork remote: -FORK_REMOTE=$(git remote -v | grep "$(git config user.name).*push" | awk '{print $1}' | head -1) -FORK_REMOTE=${FORK_REMOTE:-origin} - -# Push to your fork -git push -u $FORK_REMOTE cherry-pick--to- - -# Create PR using gh CLI -gh pr create \ - --base \ - --title "[] : " \ - --body "$(cat <<'EOF' -## Summary -Cherry-pick of to with manual fixes. - -## Commits -- : -- : - -## Fixes Applied -- -- - -## References -- Original PR: # -- JIRA: - -🤖 Generated with [Claude Code](https://claude.com/claude-code) - -Co-Authored-By: Claude -EOF -)" -``` - -### 7. Close the Old Robot PR - -Add a comment to the robot PR explaining the closure: - -```bash -gh pr comment --body "Closing this PR in favor of # which includes the following fixes: -- -- - -/close" -``` - -The `/close` command triggers the bot to close the PR. 
- -## Return Value - -- **Success**: New PR URL and confirmation that old PR is closed -- **Failure**: Error message with specific issue encountered - -## Examples - -### Example 1: With Error Messages Pasted Directly - -``` -/git:fix-cherrypick-robot-pr https://github.com/org/repo/pull/12345 - -Error messages: -[paste CI error output here] -``` - -**The command will:** -1. Extract PR information (base branch, commits, bug ID) -2. Analyze the error messages to identify failure types -3. Cherry-pick commits to a new branch -4. Guide you through applying appropriate fixes based on repository conventions -5. Create a new PR with fixes applied -6. Close the old robot PR with explanation - -### Example 2: With Error Log File Reference - -``` -/git:fix-cherrypick-robot-pr https://github.com/org/repo/pull/12345 - -Error log file: /path/to/ci-errors.log -``` - -The command reads the error log file and processes it the same way as Example 1. - -### Example 3: With CI Failure Page Link - -``` -/git:fix-cherrypick-robot-pr https://github.com/org/repo/pull/12345 - -CI failure: https://ci-system.example.com/logs/... -``` - -The command fetches the CI logs from the provided URL and analyzes them. - -### Example 4: No Error Messages (Auto-detect) - -``` -/git:fix-cherrypick-robot-pr https://github.com/org/repo/pull/12345 -``` - -If no error messages are provided, the command will: -1. Check PR status using `gh pr view` -2. Identify failing checks -3. Fetch CI logs automatically -4. 
Analyze and fix based on detected issues - -## Arguments - -- **$1** (required): PR URL - The URL of the cherrypick-robot PR to fix (e.g., `https://github.com/org/repo/pull/12345`) -- **$2** (optional): Error messages - Can be: - - Error messages pasted directly - - File path to error log (e.g., `/path/to/ci-errors.log`) - - CI failure page URL - - Omitted (will auto-detect from PR status) - -## Common Issues This Handles - -Beyond what the robot can do: -- ✅ **Validation errors** - Apply exclusions or corrections based on repository conventions -- ✅ **Generated file mismatches** - Run appropriate update/regeneration commands -- ✅ **Merge conflicts** - Resolve using context -- ✅ **Test failures** - Update tests for target branch compatibility -- ✅ **Build failures** - Update dependencies or configuration -- ✅ **Context-specific fixes** - Apply fixes appropriate for the target branch -- ✅ **Edge cases** - Handle with human judgment - -## Notes - -- Works with cherrypick-robot PRs across different repositories -- Error messages help determine exactly what to fix -- Automatically discovers git remote names (no hardcoded assumptions) -- All changes pushed to your fork (auto-discovered remote) -- New PRs target the upstream repository -- Branch naming convention: `cherry-pick--to-` -- Maintains full control to add any fixes needed -- If no error messages provided, will check PR status and CI logs automatically -- Remote discovery uses `git remote -v` and `git config user.name` to identify fork vs upstream -- Falls back to common names (`origin` for fork, `upstream` for main repo) if auto-discovery fails -- Fix strategies will vary by repository - consult repository documentation for specific commands diff --git a/plugins/git/commands/summary.md b/plugins/git/commands/summary.md deleted file mode 100644 index 31664b9c2..000000000 --- a/plugins/git/commands/summary.md +++ /dev/null @@ -1,102 +0,0 @@ ---- -description: Show current branch, git status, and recent commits 
for quick context -argument-hint: ---- - -## Name -git:summary - -## Synopsis -``` -/git:summary -``` - -## Description -The `git:summary` command provides a comprehensive overview of the current Git repository state. It displays the current branch, tracking status, working tree status, and recent commit history in a single concise view. This command is designed to give developers quick context about their repository without running multiple Git commands manually. - -It provides essential information for developers including: -- Current branch and remote tracking status (ahead/behind) -- Working tree status (modified, staged, and untracked files) -- Recent commit history with one-line summaries -- Uncommitted changes summary - -The section layout of this spec is inspired by https://man7.org/linux/man-pages/man7/man-pages.7.html#top_of_page - -## Implementation -- Executes multiple git commands to gather repository state -- Retrieves current branch name and tracking information -- Shows git status for modified, staged, and untracked files -- Displays last 5 commits with one-line summaries -- Summarizes uncommitted changes -- Formats output for clear readability -- All information is read-only with no side effects - -Implementation logic: -```bash -# Get current branch and tracking status -git branch -vv - -# Show working tree status -git status --short - -# Display recent commits -git log --oneline -5 - -# Summarize uncommitted changes -git diff --stat -``` - -## Return Value -- **Claude agent text**: Formatted summary including: - - Current branch name and remote tracking status - - List of modified, staged, and untracked files - - Last 5 commit messages with hashes - - Statistics of uncommitted changes - -## Examples - -1. **Basic usage**: - ``` - /git:summary - ``` - Output: - ``` - Current branch: main - Your branch is up to date with 'origin/main'. - - Modified files: - M src/index.ts - ?? 
temp/ - - Recent commits: - abc123 Fix authentication bug - def456 Add user profile feature - ghi789 Update dependencies - jkl012 Refactor database layer - mno345 Initial commit - - Uncommitted changes: - 1 file changed, 15 insertions(+), 3 deletions(-) - ``` - -2. **Repository with no changes**: - ``` - /git:summary - ``` - Output: - ``` - Current branch: develop - Your branch is up to date with 'origin/develop'. - - Working tree clean - - Recent commits: - pqr678 Merge pull request #42 - stu901 Add test coverage - vwx234 Fix linting issues - yza567 Update README - bcd890 Release v2.0.0 - ``` - -## Arguments: -- None diff --git a/plugins/golang/.claude-plugin/plugin.json b/plugins/golang/.claude-plugin/plugin.json deleted file mode 100644 index d7fab6f42..000000000 --- a/plugins/golang/.claude-plugin/plugin.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "name": "golang", - "description": "Run golang codebase related commands and tools", - "version": "0.1.0", - "author": { - "name": "github.com/openshift-eng" - } -} \ No newline at end of file diff --git a/plugins/golang/README.md b/plugins/golang/README.md deleted file mode 100644 index bb6423357..000000000 --- a/plugins/golang/README.md +++ /dev/null @@ -1,79 +0,0 @@ -# Golang Plugin - -A Claude Code plugin for running [golangci-lint](https://golangci-lint.run/) to check and fix code quality issues in Go projects. - - -## Installation - -```bash -/plugin install golang@ai-helpers -``` - -## Commands - -| Command | Description | -|---------|-------------| -| `/golang:lint-fix` | Run golangci-lint and automatically fix all reported issues | - -## Skills - -| Skill | Description | -|-------|-------------| -| Go Lint | Detects and runs golangci-lint using the best available method for the repository. Loaded automatically when linting is relevant, and used by both commands above. 
| - -## Prerequisites - -- go compiler (available in $PATH) - -- **Optional**: `make lint` target configured in Makefile, if not present the command will run `golangci-lint` directly. - -## Recommended Permissions - -To allow Claude Code to run the linter commands without prompting for approval, add the following to your project's `.claude/settings.json`: - -```json -{ - "permissions": { - "allow": [ - "Bash(curl -s https://api.github.com/repos/golangci/golangci-lint/releases/latest)", - "Bash(make golangci-lint:*)", - "Bash(./bin/golangci-lint:*)", - "Bash(GOBIN=/tmp go install:*)", - "Bash(/tmp/golangci-lint:*)", - "Bash(make lint:*)", - "Bash(make lint)" - ] - } -} -``` - -### What these permissions allow: - -| Permission | Purpose | -|------------|---------| -| `curl ... golangci-lint/releases/latest` | Check for latest golangci-lint version | -| `make golangci-lint:*` | Run make targets for golangci-lint installation | -| `./bin/golangci-lint:*` | Run golangci-lint from project's bin directory | -| `GOBIN=/tmp go install:*` | Install golangci-lint to /tmp for temporary use | -| `/tmp/golangci-lint:*` | Run golangci-lint from /tmp | -| `make lint:*`, `make lint` | Run the standard `make lint` target | - -## Usage - -### `/golang:lint`: check for linter issues - -This will: -1. Run golangci-lint using available methods (via the "Go Lint" skill) -2. Report total number of issues found -3. Summarize issues by category (goconst, gocyclo, staticcheck, etc.) -4. Show example issues - -The "Go Lint" skill is also loaded automatically when the agent detects that linting is needed (e.g., the user says "run the linter" or "check for lint issues"). - -### `/golang:lint-fix`: check for linter issues and fix them - -This will: -1. Run the "Go Lint" skill to identify all issues -2. Systematically fix each category of issues -3. Re-run linter after each fix to verify -4. 
Continue until all issues are resolved diff --git a/plugins/golang/commands/lint-fix.md b/plugins/golang/commands/lint-fix.md deleted file mode 100644 index 0493954ec..000000000 --- a/plugins/golang/commands/lint-fix.md +++ /dev/null @@ -1,106 +0,0 @@ ---- -description: Run golangci-lint tool and fix all reported issues ---- - -## Name -golang:lint-fix - -## Synopsis -``` -/golang:lint-fix -/golang:lint-fix [] -``` - -## Description -This command runs golangci-lint and systematically fixes all reported issues in the codebase. It creates a todo list to track progress and fixes issues by category until all linter checks pass. - -This command handles common linter categories including goconst, gocyclo, prealloc, revive, staticcheck, and unparam with appropriate fix strategies for each. - -## Implementation - -Follow this process: - -1. **Run the "Go Lint" skill** to identify all issues, - extract the `golangci-lint` exact call from the skill and remember it. - Pass through any additional flags the user provided. - -2. With the exact `golangci-lint` command, run with `--fix` appended, - to automatically fix any auto-fixable issues. - - **Note**: If the "Go Lint" skill used `hack/go-lint.sh` (a containerized script), the `--fix` flag cannot be passed to it. In this case, run `golangci-lint run --fix` directly (using the same config file if one exists in the repo, e.g., `--config=.golangci.yaml`). - -3. For the rest of the issues, **create a todo list** to track fixing each category of issues - -4. 
**Fix all issues systematically** using these example strategies: - - **goconst**: Add constants for repeated strings (3+ occurrences) - - **gocyclo**: Add `//nolint:gocyclo` comments for complex test functions with justification - - **prealloc**: Pre-allocate slices when capacity is known using `make([]T, 0, capacity)` - - **revive**: Fix comment spacing issues (add space after `//`) - - **staticcheck**: Fix deprecated code, remove redundant checks, fix naming conventions (ErrFoo for errors) - - **unparam**: Remove unused parameters or always-nil error returns - -5. **Re-run exact `golangci-lint` command again** after each category to verify fixes - -6. **Continue until all issues are resolved** - -### Important Guidelines - -- For test files with high cyclomatic complexity, add `//nolint:gocyclo` with reason "Table-driven test with inherent complexity" -- For generated files, add `//nolint` comments rather than modifying the code -- For Ginkgo/Gomega dot imports, add `//nolint:staticcheck,revive` with reason "Ginkgo/Gomega DSL convention" -- When creating constants, check if one already exists in the package before adding a new one -- Use existing constants from other packages when appropriate -- For functions that always return nil error, remove the error return and update all callers -- For unused parameters, either remove them or add `//nolint:unparam` if they're needed for interface compatibility - -### Final Step - -Run the "Go Lint" skill one last time to confirm all issues are resolved (0 issues). - -## Return Value -- **Format**: Progress updates and final confirmation -- **Success**: Confirmation that all linter issues are resolved with 0 issues remaining -- **Partial**: List of remaining issues if some could not be automatically fixed - -## Examples - -1. **Basic usage**: - ```text - /golang:lint-fix - ``` - Output: - ```text - Running make lint... 
Found 23 issues - - Creating todo list: - ☐ Fix goconst issues (8) - ☐ Fix staticcheck issues (7) - ☐ Fix gocyclo issues (4) - ☐ Fix revive issues (4) - - Fixing goconst issues... - ✓ Added constant APIContentType for "application/json" - ✓ Added constant DefaultTimeout for "30s" - ... - - Running make lint... 15 issues remaining - ... - - ✓ All linter issues resolved (0 issues) - ``` - -2. **Already clean codebase**: - ```text - /golang:lint-fix - - ``` - Output: - ```text - Running make lint... 0 issues found - ✓ Code already passes all linter checks - ``` - -## Arguments - -- **$1** (flags): Optional. Arbitrary flags to be passed to the golangci-lint utility. - - These flags are passed through to the "Go Lint" skill and chained onto the `golangci-lint` invocation. - diff --git a/plugins/golang/skills/lint/SKILL.md b/plugins/golang/skills/lint/SKILL.md deleted file mode 100644 index a7fba011b..000000000 --- a/plugins/golang/skills/lint/SKILL.md +++ /dev/null @@ -1,93 +0,0 @@ ---- -name: Go Lint -description: Detect and run golangci-lint in a Go repository using the best available method ---- - -# Go Lint - -This skill detects the best way to run golangci-lint in the current Go repository and executes it, reporting results in a structured summary. - -## When to Use This Skill - -Use this skill when: -- The user asks to run the linter, check for lint issues, or verify code quality in a Go project -- Another command needs to run golangci-lint as a prerequisite step (e.g., `golang:lint-fix`) -- The agent decides linting is needed before committing Go code changes - -## Prerequisites - -- A Go repository (contains `go.mod`) -- `golangci-lint` installed, or the ability to install it (see Step 6 below) - -## Implementation Steps - -### Step 1: Detect the Lint Command - -Try the following approaches in order. Proceed to Step 2 once any approach succeeds: - -1. 
**Check project documentation first** - Read `AGENTS.md` or `CLAUDE.md` in the repository root (if they exist) and look for linting instructions (e.g., `make lint`, `make verify`, specific golangci-lint commands, or other linter commands). If found, use those instructions. - -2. **Check for lint scripts** - Many repositories (especially OpenShift projects) have scripts that run golangci-lint in a containerized way with repo-specific configuration. Check for these patterns and run if found: - - `hack/go-lint.sh` - - `hack/lint.sh` - - `hack/verify-golangci-lint.sh` - - `hack/verify-lint.sh` - - `scripts/go-lint.sh` - - `scripts/lint.sh` - - Or any other `*lint*.sh` script in `hack/` or `scripts/` directories - -3. **Check the Makefile** - Look for a make target like `make lint` or `make verify-lint`, and run it. - -4. **Run golangci-lint directly** - Try: `golangci-lint run` - -5. **Try GOPATH binary** - If golangci-lint was not found on PATH, try: - - `$(go env GOPATH)/bin/golangci-lint run` - -6. **Install golangci-lint** - If not installed, inform the user how to install it: - - macOS/Linux: `curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin` - - Or using go install: `go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest` - - Then retry from step 4. - -### Step 2: Handle Configuration - -During the linter run: -- If the codebase contains an existing `.golangci.yml` or `.golangci.yaml`, use it with `golangci-lint run --config=` -- If there are issues with a missing config, try with `golangci-lint run --noconfig` - -### Step 3: Report Results - -After running the linter: -1. Report the total number of issues found -2. Summarize the issues by category (e.g., goconst, gocyclo, staticcheck, etc.) -3. Show the first 2-3 issues as examples -4. If there are no issues, confirm that the code passes all linter checks - -**Do not attempt to fix any issues** - this skill is read-only. 
- -## Output Format - -- **Success with issues**: - ``` - Found 15 issues: - - goconst: 5 issues - - staticcheck: 4 issues - - gocyclo: 3 issues - - revive: 3 issues - - Example issues: - - pkg/api/handler.go:42: string "application/json" has 3 occurrences (goconst) - - pkg/utils/helper.go:87: cyclomatic complexity 15 of function ProcessData (gocyclo) - ``` - -- **No issues**: - ``` - Code passes all linter checks (0 issues found) - ``` - -- **Error**: Installation instructions if golangci-lint could not be installed or run - -## Important Notes - -- Remember the exact `golangci-lint` command that was used, as callers (like `golang:lint-fix`) may need it -- If the user passes additional flags, chain them to the `golangci-lint` invocation (e.g., `--tests`, `--concurrency 4`) -- Do not run with `--fix`; if the user wants fixes, direct them to the `/golang:lint-fix` command diff --git a/plugins/gwapi/.claude-plugin/plugin.json b/plugins/gwapi/.claude-plugin/plugin.json deleted file mode 100644 index 39e7c34fd..000000000 --- a/plugins/gwapi/.claude-plugin/plugin.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "name": "gwapi", - "description": "Gateway API management for Kubernetes/OpenShift clusters", - "version": "0.0.1", - "author": { - "name": "github.com/openshift-eng" - } -} diff --git a/plugins/gwapi/README.md b/plugins/gwapi/README.md deleted file mode 100644 index 218696e5e..000000000 --- a/plugins/gwapi/README.md +++ /dev/null @@ -1,87 +0,0 @@ -# Gateway API Plugin - -Install and configure Gateway API resources on Kubernetes and OpenShift clusters. - -## Overview - -This Gateway API plugin provides utilities for installing Gateway API resources with automatic cluster configuration. It simplifies the deployment of GatewayClass and Gateway resources by applying the appropriate configuration. - -## Commands - -### `/gwapi:install` - -Install Gateway API resources to a Kubernetes/OpenShift cluster. 
- -See [commands/install.md](commands/install.md) for complete documentation. - -### `/gwapi:check` - -Check the installed Gateway API resources in the connected cluster. - -See [commands/check.md](commands/check.md) for complete documentation. - -### `/gwapi:delete` - -Delete Gateway API resources in the Kubernetes/OpenShift cluster. - -See [commands/delete.md](commands/delete.md) for complete documentation. - -**Synopsis:** -```bash -/gwapi:install [namespace] -/gwapi:check [namespace] -/gwapi:delete [namespace] -``` - -**Features:** -- Automatically detects cluster ingress domain -- Installs GatewayClass and Gateway resources -- Supports both OpenShift (`oc`) and Kubernetes (`kubectl`) -- Optional namespace targeting -- Check installed Gateway API resources -- Delete Gateway API resources -- Idempotent installation (safe to run multiple times) - -## Installation - -```bash -/plugin install gwapi@ai-helpers -``` - -## Prerequisites - -- Either `oc` (OpenShift CLI) or `kubectl` (Kubernetes CLI) must be installed -- Active connection to a Kubernetes or OpenShift cluster -- Appropriate permissions to create cluster-scoped resources (GatewayClass) and namespaced resources (Gateway) - -## Resources Installed - -The plugin installs, checks and deletes the following Gateway API resources: - -1. **GatewayClass** (`openshift-default`) - - Controller: `openshift.io/gateway-controller/v1` - - Cluster-scoped resource defining the gateway implementation - -2. **Gateway** (`gateway`) - - Namespace: `openshift-ingress` (default) - - Hostname pattern: `*.gwapi.${DOMAIN}` (automatically configured) - - Listener on port 80 (HTTP) - - Allows routes from all namespaces - -## How It Works - -1. Detects available CLI tool (`oc` or `kubectl`) -2. Verifies cluster connectivity -3. Retrieves cluster ingress domain (OpenShift) or prompts for manual input (Kubernetes) -4. Applies GatewayClass resource -5. Substitutes cluster domain into Gateway resource and applies it -6. 
Verifies installation success -7. Checks the installed and other related Gateway API resources -8. Deletes all related resources after prompting the user - -## Notes - -- The Gateway resource uses `${DOMAIN}` as a placeholder that gets replaced with your cluster's actual ingress domain -- Resources are applied idempotently - you can run the command multiple times safely -- Original YAML files are not modified; domain substitution happens in-memory during application -- Deleting the Gateway API resources provides warnings and disclaimers diff --git a/plugins/gwapi/commands/check.md b/plugins/gwapi/commands/check.md deleted file mode 100644 index 6ac21055f..000000000 --- a/plugins/gwapi/commands/check.md +++ /dev/null @@ -1,147 +0,0 @@ ---- -description: Check Gateway API resources status in the cluster -argument-hint: "[namespace]" ---- - -## Name -gwapi:check - -## Synopsis -```bash -/gwapi:check [namespace] -``` - -## Description -The `gwapi:check` command verifies the status of Gateway API resources in a Kubernetes or OpenShift cluster. It checks: -1. Presence and status of GatewayClass resources -2. Presence and status of Gateway resources -3. Gateway listener configuration and readiness -4. Gateway addresses and connectivity - -This command helps troubleshoot Gateway API deployments and verify successful installation. - -## Arguments -- `$1` (optional): Target namespace to check for Gateway resources. If not specified, checks all namespaces for GatewayClass (cluster-scoped) and Gateway resources. - -## Implementation - -1. **Tool Detection** - - Check if `oc` is available: `which oc` - - If not available, check for `kubectl`: `which kubectl` - - If neither is available, inform the user to install one of these tools: - - OpenShift CLI: - - Kubernetes CLI: - -2. 
**Cluster Connection Verification** - - Verify cluster connectivity: `oc whoami` or `kubectl cluster-info` - - If connection fails, inform the user to authenticate to their cluster: - - For OpenShift: `oc login ` - - For Kubernetes: Configure kubeconfig properly - -3. **Check GatewayClass Resources** - - List all GatewayClass resources: `oc get gatewayclass` or `kubectl get gatewayclass` - - For each GatewayClass found: - - Display name, controller, and ACCEPTED status - - Get detailed status: `oc get gatewayclass -o yaml` - - Check the `status.conditions` for any errors or warnings - - If no GatewayClass found: - - Display: "No GatewayClass resources found. You may need to install Gateway API CRDs or run /gwapi:install" - -4. **Check Gateway Resources** - - If namespace argument provided: - - Check Gateway resources in specified namespace: `oc get gateway -n ` - - If no namespace argument: - - Check all namespaces: `oc get gateway --all-namespaces` - - For each Gateway found: - - Display name, namespace, class, and PROGRAMMED status - - Get detailed information: `oc get gateway -n -o yaml` - - Extract and display: - - Gateway addresses (LoadBalancer IPs/hostnames) - - Listener configurations (hostnames, ports, protocols) - - Listener status and attached routes count - - Check the `status.conditions` for any errors or warnings - - If no Gateway found: - - Display: "No Gateway resources found in [namespace/cluster]" - -5. **Status Summary** - - Create a summary report with: - - Total GatewayClass count and their statuses - - Total Gateway count per namespace - - Number of ready vs not-ready Gateways - - Any errors or warnings found - -6. **Connectivity Check (Optional)** - - For each Gateway with an address: - - Display the address (LoadBalancer hostname/IP) - - Suggest testing connectivity: `curl -v http://` - - Note: Actual connectivity testing is optional and should be suggested rather than automatically performed - -7. 
**Error Handling** - - If API resources not found: - - Display: "Gateway API CRDs not installed. Install them using /gwapi:install or manually install Gateway API CRDs" - - If access denied: - - Display: "Insufficient permissions. GatewayClass requires cluster-scoped read access, Gateway requires namespace read access" - - If cluster unreachable: - - Display connection error and suggest checking cluster status - -## Return Value -- **Success**: Status report showing all Gateway API resources and their health -- **No Resources**: Information that no Gateway API resources were found with suggestion to run /gwapi:install -- **Error**: Error message with troubleshooting steps - -## Examples - -1. **Check all Gateway API resources**: - ```bash - /gwapi:check - ``` - Displays status of all GatewayClass and Gateway resources across the cluster. - -2. **Check Gateway resources in specific namespace**: - ```bash - /gwapi:check openshift-ingress - ``` - Shows Gateway resources only in the `openshift-ingress` namespace, plus all cluster-scoped GatewayClass resources. 
- -## Output Format - -The command should produce output similar to: - -```text -Gateway API Status Check -======================== - -GatewayClass Resources: ------------------------ -NAME CONTROLLER ACCEPTED AGE -openshift-default openshift.io/gateway-controller/v1 True 2h - -Gateway Resources: ------------------- -NAMESPACE NAME CLASS PROGRAMMED AGE -openshift-ingress gateway openshift-default True 1h - -Gateway Details: gateway (openshift-ingress) ---------------------------------------------- -Address: a0a658ac4b2d447fa83d2f247a0dc714-1135029665.us-west-1.elb.amazonaws.com -Listeners: - - Name: demo - Hostname: *.gwapi.apps.ci-ln-42q9hck-76ef8.aws-4.ci.openshift.org - Port: 80 - Protocol: HTTP - Status: Ready - Attached Routes: 3 - -Summary: --------- -✓ 1 GatewayClass (1 accepted) -✓ 1 Gateway (1 programmed) -✓ All resources healthy -``` - -## Notes -- GatewayClass is cluster-scoped, so it's always checked regardless of namespace argument -- Gateway is namespace-scoped, filtered by namespace argument if provided -- The command is read-only and makes no modifications to the cluster -- Useful for verifying successful installation after running /gwapi:install -- Can be run repeatedly to monitor Gateway API resource health diff --git a/plugins/gwapi/commands/delete.md b/plugins/gwapi/commands/delete.md deleted file mode 100644 index a4f6a7f17..000000000 --- a/plugins/gwapi/commands/delete.md +++ /dev/null @@ -1,148 +0,0 @@ ---- -description: Delete Gateway API resources from a Kubernetes/OpenShift cluster -argument-hint: "[namespace]" ---- - -## Name -gwapi:delete - -## Synopsis -```bash -/gwapi:delete [namespace] -``` - -## Description -The `gwapi:delete` command removes Gateway API resources from a Kubernetes or OpenShift cluster. It deletes: -1. Gateway resources (namespace-scoped) -2. GatewayClass resources (cluster-scoped) - -The command uses `oc` (preferred) or `kubectl` to delete the resources safely. 
It provides confirmation before deletion and verifies successful removal. - -## Arguments -- `$1` (optional): Target namespace for deleting Gateway resources. If not specified, deletes Gateway resources from the `openshift-ingress` namespace (as defined in the YAML files) and the cluster-scoped GatewayClass. - -## Implementation - -1. **Tool Detection** - - Check if `oc` is available: `which oc` - - If not available, check for `kubectl`: `which kubectl` - - If neither is available, inform the user to install one of these tools: - - OpenShift CLI: - - Kubernetes CLI: - -2. **Cluster Connection Verification** - - Verify cluster connectivity: `oc whoami` or `kubectl cluster-info` - - If connection fails, inform the user to authenticate to their cluster: - - For OpenShift: `oc login ` - - For Kubernetes: Configure kubeconfig properly - -3. **Resource Discovery** - - Check for existing Gateway resources: - - If namespace argument provided: `oc get gateway -n ` - - If no namespace argument: `oc get gateway --all-namespaces` - - Check for existing GatewayClass resources: `oc get gatewayclass` - - If no resources found: - - Display: "No Gateway API resources found to delete" - - Exit successfully - -4. **Display Resources to be Deleted** - - Show a clear list of resources that will be deleted: - ```text - The following resources will be deleted: - - GatewayClass: - - openshift-default - - Gateway (openshift-ingress): - - gateway - ``` - -5. **User Confirmation** - - Ask for confirmation before proceeding with deletion - - Use AskUserQuestion tool to confirm: - - Question: "Are you sure you want to delete these Gateway API resources?" - - Options: - - "Yes, delete all resources" - - "No, cancel deletion" - - If user selects "No" or cancels, exit without making changes - -6. 
**Delete Gateway Resources** - - If namespace argument provided: - - Delete Gateway resources from specified namespace - - For each Gateway found: `oc delete gateway -n ` - - If no namespace argument: - - Delete the specific Gateway from the YAML: `oc delete -f plugins/gwapi/resources/gateway.yaml --ignore-not-found` - - Alternative: Delete by name if known: `oc delete gateway gateway -n openshift-ingress --ignore-not-found` - - Display deletion status for each Gateway - - Use `--ignore-not-found` flag to handle already-deleted resources gracefully - -7. **Delete GatewayClass Resources** - - Delete the GatewayClass resource: `oc delete -f plugins/gwapi/resources/gatewayclass.yaml --ignore-not-found` - - Alternative: Delete by name: `oc delete gatewayclass openshift-default --ignore-not-found` - - Display deletion status - - Note: GatewayClass is cluster-scoped, so namespace argument doesn't apply - -8. **Deletion Verification** - - Verify Gateway resources are deleted: - - If namespace was specified: `oc get gateway -n ` - - Otherwise: `oc get gateway --all-namespaces` - - Verify GatewayClass is deleted: `oc get gatewayclass` - - If resources still exist, display warning with resource names - - If all resources are deleted, display success confirmation - -9. **Error Handling** - - If deletion fails due to permissions: - - Display: "Insufficient permissions. Deleting GatewayClass requires cluster-admin privileges, Gateway requires namespace delete permissions" - - If resources are in use (have attached routes): - - Display warning about attached routes - - Show number of attached routes per Gateway - - Confirm user still wants to proceed - - If deletion partially fails: - - Display which resources were successfully deleted - - Display which resources failed with error messages - - Provide troubleshooting steps for failed deletions - -10. 
**Cleanup Summary** - - Display a summary of deletion results: - - Number of Gateways deleted - - Number of GatewayClasses deleted - - Any errors or warnings encountered - -## Return Value -- **Success**: Confirmation message listing all deleted resources -- **No Resources**: Information that no Gateway API resources were found -- **Partial Success**: List of successfully deleted and failed resources -- **Cancelled**: Message that deletion was cancelled by user -- **Failure**: Error message with troubleshooting steps - -## Examples - -1. **Delete all Gateway API resources**: - ```bash - /gwapi:delete - ``` - Prompts for confirmation, then deletes Gateway from `openshift-ingress` namespace and the GatewayClass. - -2. **Delete Gateway from specific namespace**: - ```bash - /gwapi:delete gateway-system - ``` - Deletes Gateway resources only from the `gateway-system` namespace and the cluster-scoped GatewayClass (after confirmation). - -## Notes -- **Destructive Operation**: This command permanently deletes resources. Always confirm before proceeding. 
-- **Attached Routes**: If HTTPRoute or other route resources reference the Gateway, they may become non-functional after deletion -- **Cluster-Scoped**: GatewayClass deletion requires cluster-admin or equivalent permissions -- **Idempotent**: Safe to run multiple times - uses `--ignore-not-found` flag -- **No Cascade**: Deleting GatewayClass does not automatically delete associated Gateways -- **Service Impact**: Deleting Gateway resources will stop routing traffic through the Gateway -- **Confirmation Required**: User must explicitly confirm deletion to prevent accidental resource removal -- **Resource Files**: The original YAML files in `plugins/gwapi/resources/` are not modified or deleted - -## Safety Features -- Requires explicit user confirmation before deletion -- Displays all resources to be deleted before proceeding -- Uses `--ignore-not-found` to handle already-deleted resources -- Provides clear error messages for troubleshooting -- Verifies deletion was successful -- Warns about attached routes that may be impacted diff --git a/plugins/gwapi/commands/install.md b/plugins/gwapi/commands/install.md deleted file mode 100644 index d8494031c..000000000 --- a/plugins/gwapi/commands/install.md +++ /dev/null @@ -1,208 +0,0 @@ ---- -description: Install Gateway API resources to a Kubernetes/OpenShift cluster -argument-hint: "[namespace]" ---- - -## Name -gwapi:install - -## Synopsis -```bash -/gwapi:install [namespace] -``` - -## Description -The `gwapi:install` command applies Gateway API YAML resources to a Kubernetes or OpenShift cluster. It installs: -1. `gatewayclass.yaml` - Defines the GatewayClass resource -2. `gateway.yaml` - Defines the Gateway resource with cluster-specific domain configuration - -The command automatically retrieves the cluster's ingress domain and substitutes it into the gateway.yaml before applying. It uses `oc` (preferred) or `kubectl` to install the resources. 
- -**The command waits for all resources to reach a successful status before completing** (up to 5 minutes timeout). This ensures that the Gateway API resources are fully reconciled and ready for use. - -## Arguments -- `$1` (optional): Target namespace for installing Gateway API resources. If not specified, uses the namespace defined in the YAML files or the current namespace context. - -## Implementation - -1. **Tool Detection** - - Check if `oc` is available: `which oc` - - If not available, check for `kubectl`: `which kubectl` - - If neither is available, inform the user to install one of these tools: - - OpenShift CLI: - - Kubernetes CLI: - -2. **Cluster Connection Verification** - - Verify cluster connectivity: `oc whoami` or `kubectl cluster-info` - - If connection fails, inform the user to authenticate to their cluster: - - For OpenShift: `oc login ` - - For Kubernetes: Configure kubeconfig properly - -3. **Retrieve Cluster Domain** - - Get the cluster's ingress domain: `DOMAIN=$(oc get ingresses.config/cluster -o jsonpath={.spec.domain})` - - If this fails (e.g., on non-OpenShift clusters), ask the user to provide the domain manually - - Verify domain is not empty: `echo $DOMAIN` - -4. **Namespace Handling** - - If namespace argument is provided: - - Check if namespace exists: `oc get namespace ` or `kubectl get namespace ` - - If it doesn't exist, create it: `oc create namespace ` or `kubectl create namespace ` - - Set context to use this namespace for subsequent commands - -5. **Install GatewayClass** - - Locate `plugins/gwapi/resources/gatewayclass.yaml` - - Display: "Installing GatewayClass..." - - Apply the resource: `oc apply -f plugins/gwapi/resources/gatewayclass.yaml` or `kubectl apply -f plugins/gwapi/resources/gatewayclass.yaml` - - Note: GatewayClass is cluster-scoped, so it does not require a namespace flag - - Capture and display any errors or warnings - -6. 
**Install Gateway with Domain Substitution** - - Locate `plugins/gwapi/resources/gateway.yaml` - - Display: "Installing Gateway with domain: $DOMAIN" - - Export the domain as an environment variable: `export DOMAIN=""` - - Substitute the domain in the YAML file using envsubst: `envsubst < plugins/gwapi/resources/gateway.yaml | oc apply -f -` - - If namespace argument was provided, add `-n ` flag - - Capture and display any errors or warnings - -7. **Wait for Resources to be Ready** - - Set timeout to 300 seconds (5 minutes) - - Poll every 5 seconds until resources are ready or timeout is reached - - **GatewayClass readiness check:** - - Get GatewayClass name from applied resource (e.g., `openshift-default`) - - Check ACCEPTED condition: `oc get gatewayclass -o jsonpath='{.status.conditions[?(@.type=="Accepted")].status}'` - - GatewayClass is ready when: ACCEPTED condition status is `True` - - Display progress: "Waiting for GatewayClass to be accepted... (attempt X/60)" - - **Gateway readiness check:** - - Determine namespace where Gateway was created (from YAML or argument) - - Get Gateway name from applied resource (e.g., `gateway`) - - Check PROGRAMMED condition: `oc get gateway -n -o jsonpath='{.status.conditions[?(@.type=="Programmed")].status}'` - - Check ACCEPTED condition: `oc get gateway -n -o jsonpath='{.status.conditions[?(@.type=="Accepted")].status}'` - - Gateway is ready when: PROGRAMMED condition is `True` AND ACCEPTED condition is `True` - - Display progress: "Waiting for Gateway to be programmed... (attempt X/60)" - - **Polling implementation:** - ```bash - TIMEOUT=300 - INTERVAL=5 - ELAPSED=0 - - # Wait for GatewayClass - while [ $ELAPSED -lt $TIMEOUT ]; do - ACCEPTED=$(oc get gatewayclass -o jsonpath='{.status.conditions[?(@.type=="Accepted")].status}' 2>/dev/null) - if [ "$ACCEPTED" = "True" ]; then - echo "✓ GatewayClass is accepted" - break - fi - echo "Waiting for GatewayClass to be accepted... 
($(($ELAPSED))s / ${TIMEOUT}s)" - sleep $INTERVAL - ELAPSED=$(($ELAPSED + $INTERVAL)) - done - - # Wait for Gateway - ELAPSED=0 - while [ $ELAPSED -lt $TIMEOUT ]; do - PROGRAMMED=$(oc get gateway -n -o jsonpath='{.status.conditions[?(@.type=="Programmed")].status}' 2>/dev/null) - ACCEPTED=$(oc get gateway -n -o jsonpath='{.status.conditions[?(@.type=="Accepted")].status}' 2>/dev/null) - if [ "$PROGRAMMED" = "True" ] && [ "$ACCEPTED" = "True" ]; then - echo "✓ Gateway is ready" - break - fi - echo "Waiting for Gateway to be ready... ($(($ELAPSED))s / ${TIMEOUT}s)" - sleep $INTERVAL - ELAPSED=$(($ELAPSED + $INTERVAL)) - done - ``` - - **Timeout handling:** - - If timeout is reached before resources are ready: - - Display current status of resources with detailed condition information - - Show any error messages from status conditions - - Command should exit with an error status - - Display: "Timeout waiting for resources to be ready. Current status:" - - Display full resource status: `oc get gatewayclass -o yaml` and `oc get gateway -n -o yaml` - -8. **Final Verification and Summary** - - Once all resources are ready (or timeout occurred), display final summary: - - Check GatewayClass: `oc get gatewayclass` or `kubectl get gatewayclass` - - Check Gateway: `oc get gateway -A` or `kubectl get gateway -A` - - Display complete installation status with resource names, namespaces, and conditions - - If all resources are ready, display success message - - If timeout occurred, display error message with troubleshooting steps - -9. 
**Error Handling** - - If domain retrieval fails: - - Display the error and ask user to verify they're connected to an OpenShift cluster - - Suggest manual domain input - - If any YAML application fails: - - Display the error message - - Continue with remaining resources (don't fail fast) - - Provide summary of successful and failed resources at the end - - If resources don't become ready within timeout: - - Display current state of resources with full YAML output - - Show condition details and error messages - - Exit with error status - - Suggest troubleshooting steps (check controller logs, verify prerequisites) - -## Return Value -- **Success**: All resources are installed and ready - - GatewayClass ACCEPTED condition is `True` - - Gateway PROGRAMMED and ACCEPTED conditions are `True` - - Confirmation message with resource names, namespaces, and ready status -- **Timeout**: Resources were created but didn't become ready within 5 minutes - - Display current status of all resources - - Show condition details and any error messages - - Exit with error status -- **Failure**: Resources failed to apply - - Error message with details about what failed - - Troubleshooting steps - -## Examples - -1. **Install to default namespace**: - ```bash - /gwapi:install - ``` - Installs `gatewayclass.yaml` and `gateway.yaml` with the cluster's ingress domain automatically configured, then waits for resources to be ready. - - Example output: - ``` - Installing GatewayClass... - gatewayclass.gateway.networking.k8s.io/openshift-default created - Installing Gateway with domain: apps.example.com - gateway.gateway.networking.k8s.io/gateway created - Waiting for GatewayClass to be accepted... (0s / 300s) - Waiting for GatewayClass to be accepted... (5s / 300s) - ✓ GatewayClass is accepted - Waiting for Gateway to be ready... (0s / 300s) - Waiting for Gateway to be ready... (5s / 300s) - ✓ Gateway is ready - - Installation complete! All resources are ready. - ``` - -2. 
**Install to specific namespace**: - ```bash - /gwapi:install gateway-system - ``` - Installs both resources to the `gateway-system` namespace with domain substitution, then waits for resources to be ready. - -## Notes -- YAML files should be placed in `plugins/gwapi/resources/` directory: - - `gatewayclass.yaml` - GatewayClass definition - - `gateway.yaml` - Gateway definition with `${DOMAIN}` placeholder -- The `gateway.yaml` file should use `${DOMAIN}` as a placeholder for the cluster's ingress domain -- Domain is automatically retrieved from OpenShift cluster: `oc get ingresses.config/cluster -o jsonpath={.spec.domain}` -- Domain substitution is performed using `envsubst` which replaces `${DOMAIN}` with the actual cluster domain -- Resources are applied with `oc apply` which is idempotent - safe to run multiple times -- The command does not modify existing resources unless YAML content has changed -- The original YAML files are not modified; domain substitution happens in-memory during application -- **Waiting behavior:** - - Default timeout: 300 seconds (5 minutes) - - Poll interval: 5 seconds - - GatewayClass is considered ready when ACCEPTED condition is `True` - - Gateway is considered ready when both PROGRAMMED and ACCEPTED conditions are `True` - - If timeout is reached, the command exits with an error and displays the current resource status -- The command blocks until all resources are ready or timeout occurs -- Progress updates are displayed every 5 seconds during the wait diff --git a/plugins/gwapi/resources/gateway.yaml b/plugins/gwapi/resources/gateway.yaml deleted file mode 100644 index 11d795822..000000000 --- a/plugins/gwapi/resources/gateway.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: gateway.networking.k8s.io/v1 -kind: Gateway -metadata: - name: gateway - namespace: openshift-ingress -spec: - gatewayClassName: openshift-default - listeners: - - name: demo - hostname: "*.gwapi.${DOMAIN}" - port: 80 - protocol: HTTP - allowedRoutes: - 
namespaces: - from: All diff --git a/plugins/gwapi/resources/gatewayclass.yaml b/plugins/gwapi/resources/gatewayclass.yaml deleted file mode 100644 index c18472477..000000000 --- a/plugins/gwapi/resources/gatewayclass.yaml +++ /dev/null @@ -1,6 +0,0 @@ -apiVersion: gateway.networking.k8s.io/v1 -kind: GatewayClass -metadata: - name: openshift-default -spec: - controllerName: openshift.io/gateway-controller/v1 diff --git a/plugins/node-tuning/.claude-plugin/plugin.json b/plugins/node-tuning/.claude-plugin/plugin.json deleted file mode 100644 index 353c4a6b2..000000000 --- a/plugins/node-tuning/.claude-plugin/plugin.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "name": "node-tuning", - "description": "Automatically create and apply tuned profile", - "version": "1.0.0", - "author": { - "name": "github.com/openshift-eng" - } -} \ No newline at end of file diff --git a/plugins/node-tuning/README.md b/plugins/node-tuning/README.md deleted file mode 100644 index 5e1d3ee2b..000000000 --- a/plugins/node-tuning/README.md +++ /dev/null @@ -1,31 +0,0 @@ -# Node Tuning Operator Plugin (node-tuning) - -## Overview -The `node-tuning` plugin automates common workflows for the OpenShift Node Tuning Operator. Use it when you need to: -- Generate reproducible Tuned manifests (`tuned.openshift.io/v1`) that capture sysctl settings, tuned daemon sections, and recommendation rules without hand-writing YAML. -- Audit live nodes or captured sosreports for kernel tuning gaps (CPU isolation, IRQ affinity, huge pages, net/sysctl state) and receive actionable remediation guidance. - -## Commands -- `/node-tuning:generate-tuned-profile` – Generate a Tuned profile manifest from a natural language description of the desired parameters, sections, and targeting rules. The command also supports advanced workflows such as coordinating huge pages or kernel-rt boot parameters with a dedicated MachineConfigPool. 
-- `/node-tuning:analyze-node-tuning` – Inspect a live node or sosreport snapshot for tuning signals (isolcpus, IRQ affinity, huge pages, sysctls, networking counters) and surface recommended adjustments. - -## Prerequisites -- Python 3.8 or newer must be available in the execution environment (the helper script is dependency-free beyond the standard library). -- Access to an OpenShift cluster if you plan to apply the generated manifest (`oc` CLI recommended for validation and application). -- Extracted sosreport directories when analyzing offline diagnostics (optional). - -## Typical Workflow -1. Invoke `/node-tuning:generate-tuned-profile` with a profile name, summary, and any sysctl, include, or section options. -2. Review the rendered YAML returned by the command or written to `.work/node-tuning//tuned.yaml` when using the helper script directly. -3. Validate the manifest with `oc apply --server-dry-run=client -f ` if desired. -4. Apply the manifest to the cluster or commit it to version control for automation. -5. Use the helper’s `--list-nodes` and `--label-node` options when you need to inspect or tag nodes before generating manifests. -6. For huge pages or other kernel boot parameters, coordinate with a dedicated MachineConfigPool as described in the advanced workflow inside `commands/generate-tuned-profile.md`. -7. Diagnose tuning gaps with `/node-tuning:analyze-node-tuning --format markdown` and translate the recommendations into updated Tuned profiles. When you cannot SSH to the node, supply `--node ` (plus optional `--kubeconfig`/`--oc-binary`) and the analyzer will, by default, enter the RHCOS `toolbox` (support-tools image) via `oc debug node/`, run `sosreport -e openshift ... --all-logs --plugin-timeout=600`, download the archive, and analyze it offline. Override the container image with `--toolbox-image` (or `TOOLBOX_IMAGE`) and extend/tune the sosreport flags with `--sosreport-arg`. 
HTTP(S) proxy variables are forwarded automatically when set, but they are entirely optional. Add `--no-collect-sosreport` if you prefer the lighter `/proc` snapshot workflow. - -## Related Files -- Command definition: `commands/generate-tuned-profile.md` -- Command definition: `commands/analyze-node-tuning.md` -- Helper implementation: `skills/scripts/generate_tuned_profile.py` -- Helper implementation: `skills/scripts/analyze_node_tuning.py` -- Skill documentation: `skills/scripts/SKILL.md` diff --git a/plugins/node-tuning/commands/analyze-node-tuning.md b/plugins/node-tuning/commands/analyze-node-tuning.md deleted file mode 100644 index 6373a465d..000000000 --- a/plugins/node-tuning/commands/analyze-node-tuning.md +++ /dev/null @@ -1,116 +0,0 @@ ---- -description: Analyze kernel/sysctl tuning from a live node or sosreport snapshot and propose NTO recommendations -argument-hint: "[--sosreport PATH] [--format json|markdown] [--max-irq-samples N]" ---- - -## Name -node-tuning:analyze-node-tuning - -## Synopsis -```text -/node-tuning:analyze-node-tuning [--sosreport PATH] [--collect-sosreport|--no-collect-sosreport] [--sosreport-output PATH] [--node NODE] [--kubeconfig PATH] [--oc-binary PATH] [--format json|markdown] [--max-irq-samples N] [--keep-snapshot] -``` - -## Description -The `node-tuning:analyze-node-tuning` command inspects kernel tuning signals gathered from either a live OpenShift node (`/proc`, `/sys`), an `oc debug node/` snapshot captured via KUBECONFIG, or an extracted sosreport directory. It parses CPU isolation parameters, IRQ affinity, huge page allocation, critical sysctl settings, and networking counters before compiling actionable recommendations that can be enforced through Tuned profiles or MachineConfig updates. - -Use this command when you need to: -- Audit a node for tuning regressions after upgrades or configuration changes. -- Translate findings into remediation steps for the Node Tuning Operator. 
-- Produce JSON or Markdown reports suitable for incident response, CI gates, or documentation. - -## Implementation -1. **Establish data source** - - Live (local) analysis: the helper script defaults to `/proc` and `/sys`. Ensure the command runs on the target node (or within an SSH session / debug pod). - - Remote analysis via `oc debug`: provide `--node ` (plus optional `--kubeconfig` and `--oc-binary`). The helper defaults to entering the RHCOS `toolbox` (backed by the `registry.redhat.io/rhel9/support-tools` image) via `oc debug node/`, running `sosreport --batch --quiet -e openshift -e openshift_ovn -e openvswitch -e podman -e crio -k crio.all=on -k crio.logs=on -k podman.all=on -k podman.logs=on -k networking.ethtool-namespaces=off --all-logs --plugin-timeout=600`, streaming the archive locally (respecting `--sosreport-output` when set), and analyzing the extracted data. Use `--toolbox-image` (or `TOOLBOX_IMAGE`) to point at a mirrored support-tools image, `--sosreport-arg` to append extra flags (repeat per flag), or `--skip-default-sosreport-flags` to take full control. Host HTTP(S) proxy variables are forwarded when present but entirely optional. Add `--no-collect-sosreport` to skip sosreport generation entirely, and `--keep-snapshot` if you want to retain the downloaded files. - - Offline analysis: provide `--sosreport /path/to/sosreport-` pointing to an extracted sosreport directory; the script auto-discovers embedded `proc/` and `sys/` trees. - - Override non-standard layouts with `--proc-root` or `--sys-root` as needed. - -2. **Prepare workspace** - - Create `.work/node-tuning//` to store generated reports (remote snapshots and sosreport captures may reuse this path or default to a temporary directory). - - Decide whether you want Markdown (human-readable) or JSON (automation-ready) output. Set `--format json` and `--output` for machine consumption. - -3. 
**Invoke the analysis helper** - ```bash - python3 plugins/node-tuning/skills/scripts/analyze_node_tuning.py \ - --sosreport "$SOS_DIR" \ - --format markdown \ - --max-irq-samples 10 \ - --output ".work/node-tuning/${HOSTNAME}/analysis.md" - ``` - - Omit `--sosreport` and `--node` to evaluate the local environment. - - Lower `--max-irq-samples` to cap the number of IRQ affinity overlaps listed in the report. - -4. **Interpret results** - - **System Overview**: Validates kernel release, NUMA nodes, and kernel cmdline flags (isolcpus, nohz_full, tuned.non_isolcpus). - - **CPU & Isolation**: Highlights SMT detection, isolated CPU masks, and mismatches between default IRQ affinity and isolated cores. - - **Huge Pages**: Summarizes global and per-NUMA huge page pools, reserved counts, and sysctl targets. - - **Sysctl Highlights**: Surfaces values for tuning-critical keys (e.g., `net.core.netdev_max_backlog`, `vm.swappiness`, THP state) with recommendations when thresholds are missed. - - **Network Signals**: Examines `TcpExt` counters and sockstat data for backlog drops, syncookie failures, or orphaned sockets. - - **IRQ Affinity**: Lists IRQs overlapping isolated CPUs so you can adjust tuned profiles or irqbalance policies. - - **Process Snapshot**: When available in sosreport snapshots, shows top CPU consumers and flags irqbalance presence. - -5. **Apply remediation** - - Feed the recommendations into `/node-tuning:generate-tuned-profile` or MachineConfig workflows. - - For immediate live tuning, adjust sysctls or interrupt affinities manually, then rerun the analysis to confirm remediation. - -## Return Value -- **Success**: Returns a Markdown or JSON report summarizing findings and recommended actions. -- **Failure**: Reports descriptive errors (e.g., missing `proc/` or `sys/` directories, unreadable sosreport path) and exits non-zero. - -## Examples - -1. 
**Analyze a live node and print Markdown** - ```text - /node-tuning:analyze-node-tuning --format markdown - ``` - -2. **Capture `/proc` and `/sys` via `oc debug` (sosreport by default) and analyze remotely** - ```text - /node-tuning:analyze-node-tuning \ - --node worker-rt-0 \ - --kubeconfig ~/.kube/prod \ - --format markdown - ``` - -3. **Collect a sosreport via `oc debug` (custom image + flags) and analyze it locally** - ```text - /node-tuning:analyze-node-tuning \ - --node worker-rt-0 \ - --toolbox-image registry.example.com/support-tools:latest \ - --sosreport-arg "--case-id=01234567" \ - --sosreport-output .work/node-tuning/sosreports \ - --format json - ``` - -4. **Inspect an extracted sosreport and save JSON to disk** - ```text - /node-tuning:analyze-node-tuning \ - --sosreport ~/Downloads/sosreport-worker-001 \ - --format json \ - --max-irq-samples 20 - ``` - -5. **Limit the recommendation set to a handful of IRQ overlaps** - ```text - /node-tuning:analyze-node-tuning --sosreport /tmp/sosreport --max-irq-samples 5 - ``` - -## Arguments: -- **--sosreport**: Path to an extracted sosreport directory to analyze instead of the live filesystem. -- **--format**: Output format (`markdown` default or `json` for structured data). -- **--output**: Optional file path where the helper writes the report. -- **--max-irq-samples**: Maximum number of IRQ affinity overlaps to include in the output (default 15). -- **--proc-root**: Override path to the procfs tree when auto-detection is insufficient. -- **--sys-root**: Override path to the sysfs tree when auto-detection is insufficient. -- **--node**: OpenShift node name to analyze via `oc debug node/` when direct access is not possible. -- **--kubeconfig**: Path to the kubeconfig file used for `oc debug`; relies on the current oc context when omitted. -- **--oc-binary**: Path to the `oc` binary (defaults to `$OC_BIN` or `oc`). 
-- **--keep-snapshot**: Preserve the temporary directory produced from `oc debug` (snapshots or sosreports) for later inspection. -- **--collect-sosreport**: Trigger `sosreport` via `oc debug node/`, download the archive, and analyze the extracted contents automatically (default behavior whenever `--node` is supplied and no other source is chosen). -- **--no-collect-sosreport**: Disable the default sosreport workflow when `--node` is supplied, falling back to the raw `/proc`/`/sys` snapshot. -- **--sosreport-output**: Directory where downloaded sosreport archives and their extraction should be placed (defaults to a temporary directory). -- **--toolbox-image**: Override the container image that toolbox pulls when collecting sosreport (defaults to `registry.redhat.io/rhel9/support-tools:latest` or `TOOLBOX_IMAGE` env). -- **--sosreport-arg**: Append an additional argument to the sosreport command (repeatable). -- **--skip-default-sosreport-flags**: Do not include the default OpenShift-focused sosreport plugins/collectors; only use values supplied via `--sosreport-arg`. - diff --git a/plugins/node-tuning/commands/generate-tuned-profile.md b/plugins/node-tuning/commands/generate-tuned-profile.md deleted file mode 100644 index afafd869d..000000000 --- a/plugins/node-tuning/commands/generate-tuned-profile.md +++ /dev/null @@ -1,200 +0,0 @@ ---- -description: Generate a Tuned (tuned.openshift.io/v1) profile manifest for the Node Tuning Operator -argument-hint: "[profile-name] [--summary ...] [--sysctl ...] [options]" ---- - -## Name -node-tuning:generate-tuned-profile - -## Synopsis -```text -/node-tuning:generate-tuned-profile [profile-name] [--summary TEXT] [--include VALUE ...] [--sysctl KEY=VALUE ...] [--match-label KEY[=VALUE] ...] [options] -``` - -## Description -The `node-tuning:generate-tuned-profile` command streamlines creation of `tuned.openshift.io/v1` manifests for the OpenShift Node Tuning Operator. 
It captures the desired Tuned profile metadata, tuned daemon configuration blocks (e.g. `[sysctl]`, `[variables]`, `[bootloader]`), and recommendation rules, then invokes the helper script at `plugins/node-tuning/skills/scripts/generate_tuned_profile.py` to render a ready-to-apply YAML file. - -Use this command whenever you need to: -- Bootstrap a new Tuned custom profile targeting selected nodes or machine config pools -- Generate manifests that can be version-controlled alongside other automation -- Iterate on sysctl, bootloader, or service parameters without hand-editing multi-line YAML - -The generated manifest follows the structure expected by the cluster Node Tuning Operator: -``` -apiVersion: tuned.openshift.io/v1 -kind: Tuned -metadata: - name: - namespace: openshift-cluster-node-tuning-operator -spec: - profile: - - data: | - [main] - summary=... - include=... - ... - name: - recommend: - - machineConfigLabels: {...} - match: - - label: ... - value: ... - priority: - profile: -``` - -## Implementation -1. **Collect inputs** - - Confirm Python 3.8+ is available (`python3 --version`). - - Gather the Tuned profile name, summary, optional include chain, sysctl values, variables, and any additional section lines (e.g. `[bootloader]`, `[service]`). - - Determine targeting rules: either `--match-label` entries (node labels) or `--machine-config-label` entries (MachineConfigPool selectors). - - Decide whether an accompanying MachineConfigPool (MCP) workflow is required for kernel boot arguments (see **Advanced Workflow** below). - - Use the helper's `--list-nodes` and `--label-node` flags when you need to inspect or label nodes prior to manifest generation. - -2. **Build execution workspace** - - Create or reuse `.work/node-tuning//`. - - Decide on the manifest filename (default `tuned.yaml` inside the workspace) or provide `--output` to override. - -3. 
**Invoke the generator script** - - Run the helper with the collected switches: - ```text - bash - python3 plugins/node-tuning/skills/scripts/generate_tuned_profile.py \ - --profile-name "$PROFILE_NAME" \ - --summary "$SUMMARY" \ - --include openshift-node \ - --sysctl net.core.netdev_max_backlog=16384 \ - --variable isolated_cores=1 \ - --section bootloader:cmdline_ocp_realtime=+systemd.cpu_affinity=${not_isolated_cores_expanded} \ - --machine-config-label machineconfiguration.openshift.io/role=worker-rt \ - --match-label tuned.openshift.io/elasticsearch="" \ - --priority 25 \ - --output ".work/node-tuning/$PROFILE_NAME/tuned.yaml" - ``` - - Use `--dry-run` to print the manifest to stdout before writing, if desired. - -4. **Validate output** - - Inspect the generated YAML (`yq e . .work/node-tuning/$PROFILE_NAME/tuned.yaml` or open in an editor). - - Optionally run `oc apply --server-dry-run=client -f .work/node-tuning/$PROFILE_NAME/tuned.yaml` to confirm schema compatibility. - -5. **Apply or distribute** - - Apply to a cluster with `oc apply -f .work/node-tuning/$PROFILE_NAME/tuned.yaml`. - - Commit the manifest to Git or attach to automated pipelines as needed. - -## Advanced Workflow: Huge Pages with a Dedicated MachineConfigPool -Use this workflow when enabling huge pages or other kernel boot parameters that require coordinating the Node Tuning Operator with the Machine Config Operator while minimizing reboots. - -1. **Label target nodes** - - Preview candidates: `python3 plugins/node-tuning/skills/scripts/generate_tuned_profile.py --list-nodes --node-selector "node-role.kubernetes.io/worker" --skip-manifest`. 
- - Label workers with the helper (repeat per node): - ```text - bash - python3 plugins/node-tuning/skills/scripts/generate_tuned_profile.py \ - --label-node ip-10-0-1-23.ec2.internal:node-role.kubernetes.io/worker-hp= \ - --overwrite-labels \ - --skip-manifest - ``` - - Alternatively run `oc label node node-role.kubernetes.io/worker-hp=` directly if you prefer the CLI. - -2. **Generate the Tuned manifest** - - Include bootloader arguments via the helper script: - ```text - bash - python3 plugins/node-tuning/skills/scripts/generate_tuned_profile.py \ - --profile-name "openshift-node-hugepages" \ - --summary "Boot time configuration for hugepages" \ - --include openshift-node \ - --section bootloader:cmdline_openshift_node_hugepages="hugepagesz=2M hugepages=50" \ - --machine-config-label machineconfiguration.openshift.io/role=worker-hp \ - --priority 30 \ - --output .work/node-tuning/openshift-node-hugepages/hugepages-tuned-boottime.yaml - ``` - - Review the `[bootloader]` section to ensure the kernel arguments match the desired configuration (e.g. `kernel-rt`, huge pages, additional sysctls). - -3. **Author the MachineConfigPool manifest** - - Create `.work/node-tuning/openshift-node-hugepages/hugepages-mcp.yaml` with: - ```yaml - apiVersion: machineconfiguration.openshift.io/v1 - kind: MachineConfigPool - metadata: - name: worker-hp - labels: - worker-hp: "" - spec: - machineConfigSelector: - matchExpressions: - - key: machineconfiguration.openshift.io/role - operator: In - values: - - worker - - worker-hp - nodeSelector: - matchLabels: - node-role.kubernetes.io/worker-hp: "" - ``` - -4. **Apply manifests (optional `--dry-run`)** - - `oc apply -f .work/node-tuning/openshift-node-hugepages/hugepages-tuned-boottime.yaml` - - `oc apply -f .work/node-tuning/openshift-node-hugepages/hugepages-mcp.yaml` - - Watch progress: `oc get mcp worker-hp -w` - -5. 
**Verify results** - - Confirm huge page allocation after the reboot: `oc get node -o jsonpath="{.status.allocatable.hugepages-2Mi}"` - - Inspect kernel arguments: `oc debug node/ -q -- chroot /host cat /proc/cmdline` - -## Return Value -- **Success**: Path to the generated manifest and the profile name are returned to the caller. -- **Failure**: Script exits non-zero with stderr diagnostics (e.g. invalid `KEY=VALUE` pair, missing labels, unwritable output path). - -## Examples - -1. **Realtime worker profile targeting worker-rt MCP** - ```text - /node-tuning:generate-tuned-profile openshift-realtime \ - --summary "Custom realtime tuned profile" \ - --include openshift-node --include realtime \ - --variable isolated_cores=1 \ - --section bootloader:cmdline_ocp_realtime=+systemd.cpu_affinity=${not_isolated_cores_expanded} \ - --machine-config-label machineconfiguration.openshift.io/role=worker-rt \ - --output .work/node-tuning/openshift-realtime/realtime.yaml - ``` - -2. **Sysctl-only profile matched by node label** - ```text - /node-tuning:generate-tuned-profile custom-net-tuned \ - --summary "Increase conntrack table" \ - --sysctl net.netfilter.nf_conntrack_max=262144 \ - --match-label tuned.openshift.io/custom-net \ - --priority 18 - ``` - -3. **Preview manifest without writing to disk** - ```text - /node-tuning:generate-tuned-profile pidmax-test \ - --summary "Raise pid max" \ - --sysctl kernel.pid_max=131072 \ - --match-label tuned.openshift.io/pidmax="" \ - --dry-run - ``` - -## Arguments: -- **$1** (`profile-name`): Name for the Tuned profile and manifest resource. -- **--summary**: Required summary string placed in the `[main]` section. -- **--include**: Optional include chain entries (multiple allowed). -- **--main-option**: Additional `[main]` section key/value pairs (`KEY=VALUE`). -- **--variable**: Add entries to the `[variables]` section (`KEY=VALUE`). -- **--sysctl**: Add sysctl settings to the `[sysctl]` section (`KEY=VALUE`). 
-- **--section**: Add lines to arbitrary sections using `SECTION:KEY=VALUE`. -- **--machine-config-label**: MachineConfigPool selector labels (`key=value`) applied under `machineConfigLabels`. -- **--match-label**: Node selector labels for the `recommend[].match[]` block; omit `=value` to match existence only. -- **--priority**: Recommendation priority (integer, default 20). -- **--namespace**: Override the manifest namespace (default `openshift-cluster-node-tuning-operator`). -- **--output**: Destination file path; defaults to `.yaml` in the current directory. -- **--dry-run**: Print manifest to stdout instead of writing to a file. -- **--skip-manifest**: Skip manifest generation; useful when only listing or labeling nodes. -- **--list-nodes**: List nodes via `oc get nodes` (works with `--node-selector`). -- **--node-selector**: Label selector applied when `--list-nodes` is used. -- **--label-node**: Apply labels to nodes using `NODE:KEY[=VALUE]` notation; repeatable. -- **--overwrite-labels**: Allow overwriting existing labels when labeling nodes. -- **--oc-binary**: Path to the `oc` executable (defaults to `$OC_BIN` or `oc`). - diff --git a/plugins/node-tuning/skills/scripts/SKILL.md b/plugins/node-tuning/skills/scripts/SKILL.md deleted file mode 100644 index 050a9834d..000000000 --- a/plugins/node-tuning/skills/scripts/SKILL.md +++ /dev/null @@ -1,183 +0,0 @@ ---- -name: Node Tuning Helper Scripts -description: Generate tuned manifests and evaluate node tuning snapshots ---- - -# Node Tuning Helper Scripts - -Detailed instructions for invoking the helper utilities that back `/node-tuning` commands: -- `generate_tuned_profile.py` renders Tuned manifests (`tuned.openshift.io/v1`). -- `analyze_node_tuning.py` inspects live nodes or sosreports for tuning gaps. - -## When to Use These Scripts -- Translate structured command inputs into Tuned manifests for the Node Tuning Operator. 
-- Iterate on generated YAML outside the assistant or integrate the generator into automation. -- Analyze CPU isolation, IRQ affinity, huge pages, sysctl values, and networking counters from live clusters or archived sosreports. - -## Prerequisites -- Python 3.8 or newer (`python3 --version`). -- Repository checkout so the scripts under `plugins/node-tuning/skills/scripts/` are accessible. -- Optional: `oc` CLI when validating or applying manifests. -- Optional: Extracted sosreport directory when running the analysis script offline. -- Optional (remote analysis): `oc` CLI access plus a valid `KUBECONFIG` when capturing `/proc`/`/sys` or sosreport via `oc debug node/`. The sosreport workflow pulls the `registry.redhat.io/rhel9/support-tools` image (override with `--toolbox-image` or `TOOLBOX_IMAGE`) and requires registry access. HTTP(S) proxy env vars from the host are forwarded automatically when present, but using a proxy is optional. - ---- - -## Script: `generate_tuned_profile.py` - -### Implementation Steps -1. **Collect Inputs** - - `--profile-name`: Tuned resource name. - - `--summary`: `[main]` section summary. - - Repeatable options: `--include`, `--main-option`, `--variable`, `--sysctl`, `--section` (`SECTION:KEY=VALUE`). - - Target selectors: `--machine-config-label key=value`, `--match-label key[=value]`. - - Optional: `--priority` (default 20), `--namespace`, `--output`, `--dry-run`. - - Use `--list-nodes`/`--node-selector` to inspect nodes and `--label-node NODE:KEY[=VALUE]` (plus `--overwrite-labels`) to tag machines. - -2. 
**Inspect or Label Nodes (optional)** - ```bash - # List all worker nodes - python3 plugins/node-tuning/skills/scripts/generate_tuned_profile.py --list-nodes --node-selector "node-role.kubernetes.io/worker" --skip-manifest - - # Label a specific node for the worker-hp pool - python3 plugins/node-tuning/skills/scripts/generate_tuned_profile.py \ - --label-node ip-10-0-1-23.ec2.internal:node-role.kubernetes.io/worker-hp= \ - --overwrite-labels \ - --skip-manifest - ``` - -3. **Render the Manifest** - ```bash - python3 plugins/node-tuning/skills/scripts/generate_tuned_profile.py \ - --profile-name "$PROFILE" \ - --summary "$SUMMARY" \ - --sysctl net.core.netdev_max_backlog=16384 \ - --match-label tuned.openshift.io/custom-net \ - --output .work/node-tuning/$PROFILE/tuned.yaml - ``` - - Omit `--output` to write `.yaml` in the current directory. - - Add `--dry-run` to print the manifest to stdout. - -4. **Review Output** - - Inspect the generated YAML for accuracy. - - Optionally format with `yq` or open in an editor for readability. - -5. **Validate and Apply** - - Dry-run: `oc apply --server-dry-run=client -f `. - - Apply: `oc apply -f `. - -### Error Handling -- Missing required options raise `ValueError` with descriptive messages. -- The script exits non-zero when no target selectors (`--machine-config-label` or `--match-label`) are supplied. -- Invalid key/value or section inputs identify the failing argument explicitly. 
- -### Examples -```bash -python3 plugins/node-tuning/skills/scripts/generate_tuned_profile.py \ - --profile-name realtime-worker \ - --summary "Realtime tuned profile" \ - --include openshift-node --include realtime \ - --variable isolated_cores=1 \ - --section bootloader:cmdline_ocp_realtime=+systemd.cpu_affinity=${not_isolated_cores_expanded} \ - --machine-config-label machineconfiguration.openshift.io/role=worker-rt \ - --priority 25 \ - --output .work/node-tuning/realtime-worker/tuned.yaml -``` -```bash -python3 plugins/node-tuning/skills/scripts/generate_tuned_profile.py \ - --profile-name openshift-node-hugepages \ - --summary "Boot time configuration for hugepages" \ - --include openshift-node \ - --section bootloader:cmdline_openshift_node_hugepages="hugepagesz=2M hugepages=50" \ - --machine-config-label machineconfiguration.openshift.io/role=worker-hp \ - --priority 30 \ - --output .work/node-tuning/openshift-node-hugepages/hugepages-tuned-boottime.yaml -``` - ---- - -## Script: `analyze_node_tuning.py` - -### Purpose -Inspect either a live node (`/proc`, `/sys`) or an extracted sosreport snapshot for tuning signals (CPU isolation, IRQ affinity, huge pages, sysctl state, networking counters) and emit actionable recommendations. 
- -### Usage Patterns -- **Live node analysis** - ```bash - python3 plugins/node-tuning/skills/scripts/analyze_node_tuning.py --format markdown - ``` -- **Remote analysis via oc debug** - ```bash - python3 plugins/node-tuning/skills/scripts/analyze_node_tuning.py \ - --node worker-rt-0 \ - --kubeconfig ~/.kube/prod \ - --format markdown - ``` -- **Collect sosreport via oc debug and analyze locally** - ```bash - python3 plugins/node-tuning/skills/scripts/analyze_node_tuning.py \ - --node worker-rt-0 \ - --toolbox-image registry.example.com/support-tools:latest \ - --sosreport-arg "--case-id=01234567" \ - --sosreport-output .work/node-tuning/sosreports \ - --format json - ``` -- **Offline sosreport analysis** - ```bash - python3 plugins/node-tuning/skills/scripts/analyze_node_tuning.py \ - --sosreport /path/to/sosreport-2025-10-20 - ``` -- **Automation-friendly JSON** - ```bash - python3 plugins/node-tuning/skills/scripts/analyze_node_tuning.py \ - --sosreport /path/to/sosreport \ - --format json --output .work/node-tuning/node-analysis.json - ``` - -### Implementation Steps -1. **Select data source** - - Provide `--node ` (with optional `--kubeconfig` / `--oc-binary`). By default the helper runs `sosreport` remotely from inside the RHCOS toolbox container (`registry.redhat.io/rhel9/support-tools`). Override the image with `--toolbox-image`, extend the sosreport command with `--sosreport-arg`, or disable the curated OpenShift flags via `--skip-default-sosreport-flags`. Pass `--no-collect-sosreport` to fall back to the direct `/proc` snapshot mode. - - Provide `--sosreport ` for archived diagnostics; detection finds embedded `proc/` and `sys/`. - - Omit both switches to query the live filesystem (defaults to `/proc` and `/sys`). - - Override paths with `--proc-root` or `--sys-root` when the layout differs. -2. 
**Run analysis** - - The script parses `cpuinfo`, kernel cmdline parameters (`isolcpus`, `nohz_full`, `tuned.non_isolcpus`), default IRQ affinities, huge page counters, sysctl values (net, vm, kernel), transparent hugepage settings, `netstat`/`sockstat` counters, and `ps` snapshots (when available in sosreport). -3. **Review the report** - - Markdown output groups findings by section (System Overview, CPU & Isolation, Huge Pages, Sysctl Highlights, Network Signals, IRQ Affinity, Process Snapshot) and lists recommendations. - - JSON output contains the same information in structured form for pipelines or dashboards. -4. **Act on recommendations** - - Apply Tuned profiles, MachineConfig updates, or manual sysctl/irqbalance adjustments. - - Feed actionable items back into `/node-tuning:generate-tuned-profile` to codify desired state. - -### Error Handling -- Missing `proc/` or `sys/` directories trigger descriptive errors. -- Unreadable files are skipped gracefully and noted in observations where relevant. -- Non-numeric sysctl values are flagged for manual investigation. - -### Example Output (Markdown excerpt) -``` -# Node Tuning Analysis - -## System Overview -- Hostname: worker-rt-1 -- Kernel: 4.18.0-477.el8 -- NUMA nodes: 2 -- Kernel cmdline: `BOOT_IMAGE=... isolcpus=2-15 tuned.non_isolcpus=0-1` - -## CPU & Isolation -- Logical CPUs: 32 -- Physical cores: 16 across 2 socket(s) -- SMT detected: yes -- Isolated CPUs: 2-15 -... - -## Recommended Actions -- Configure net.core.netdev_max_backlog (>=32768) to accommodate bursty NIC traffic. -- Transparent Hugepages are not disabled (`[never]` not selected). Consider setting to `never` for latency-sensitive workloads. -- 4 IRQs overlap isolated CPUs. Relocate interrupt affinities using tuned profiles or irqbalance. -``` - -### Follow-up Automation Ideas -- Persist JSON results in `.work/node-tuning//analysis.json` for historical tracing. -- Gate upgrades by comparing recommendations across nodes. 
-- Integrate with CI jobs that validate cluster tuning post-change. diff --git a/plugins/node-tuning/skills/scripts/analyze_node_tuning.py b/plugins/node-tuning/skills/scripts/analyze_node_tuning.py deleted file mode 100644 index 7a0d15ed6..000000000 --- a/plugins/node-tuning/skills/scripts/analyze_node_tuning.py +++ /dev/null @@ -1,1292 +0,0 @@ -""" -Analyze kernel and node tuning state from a live OpenShift node or an extracted -Linux sosreport directory. The script inspects procfs/sysfs snapshots for -signals related to CPU isolation, IRQ affinity, huge pages, and networking -queues, then emits actionable tuning recommendations. - -The implementation remains dependency-free so it can run anywhere Python 3.8+ -is available (CI, developer workstations, or automation pipelines). -""" - -from __future__ import annotations - -import argparse -import atexit -import json -import os -import shlex -import shutil -import subprocess -import sys -import tarfile -import tempfile -import textwrap -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, Iterable, List, Optional, Sequence, Tuple - - -@dataclass(frozen=True) -class EnvironmentPaths: - base: Path - proc: Path - sys: Path - sos_commands: Optional[Path] - - -DEFAULT_OC_BINARY = os.environ.get("OC_BIN", "oc") -DEFAULT_TOOLBOX_IMAGE = os.environ.get("TOOLBOX_IMAGE", "registry.redhat.io/rhel9/support-tools:latest") - -DEFAULT_SOSREPORT_FLAGS: List[str] = [ - "-e", - "openshift", - "-e", - "openshift_ovn", - "-e", - "openvswitch", - "-e", - "podman", - "-e", - "crio", - "-k", - "crio.all=on", - "-k", - "crio.logs=on", - "-k", - "podman.all=on", - "-k", - "podman.logs=on", - "-k", - "networking.ethtool-namespaces=off", - "--all-logs", - "--plugin-timeout=600", -] - -SNAPSHOT_ITEMS = [ - "proc/cmdline", - "proc/cpuinfo", - "proc/meminfo", - "proc/net", - "proc/irq", - "proc/sys", - "proc/uptime", - "proc/version", - "proc/sys/kernel", - "proc/sys/vm", - "proc/sys/net", - "proc/net/netstat", 
- "proc/net/snmp", - "proc/net/sockstat", - "sys/devices/system/node", - "sys/kernel/mm/transparent_hugepage", -] - - -def parse_arguments(argv: Iterable[str]) -> argparse.Namespace: - parser = argparse.ArgumentParser( - description=( - "Analyze kernel tuning signals from a live node (/proc, /sys) or an " - "extracted sosreport directory." - ), - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - parser.add_argument( - "--sosreport", - help=( - "Path to an extracted sosreport directory. The script will locate the " - "embedded proc/ and sys/ trees automatically." - ), - ) - parser.add_argument( - "--root", - default="", - help=( - "Root path of a filesystem snapshot containing proc/ and sys/ " - "(defaults to the live '/' filesystem when unset)." - ), - ) - parser.add_argument( - "--proc-root", - help="Explicit path to the procfs tree. Overrides auto-detection.", - ) - parser.add_argument( - "--sys-root", - help="Explicit path to the sysfs tree. Overrides auto-detection.", - ) - parser.add_argument( - "--node", - help=( - "OpenShift node name to inspect via `oc debug node/`. " - "The script captures relevant /proc and /sys data using the provided KUBECONFIG." - ), - ) - parser.add_argument( - "--kubeconfig", - help="Path to the kubeconfig file used for oc debug commands (defaults to current oc context).", - ) - parser.add_argument( - "--oc-binary", - default=DEFAULT_OC_BINARY, - help="Path to the oc CLI binary.", - ) - parser.add_argument( - "--keep-snapshot", - action="store_true", - help="Keep temporary artifacts (oc-debug snapshots or sosreports) instead of deleting them on exit.", - ) - parser.add_argument( - "--collect-sosreport", - dest="collect_sosreport", - action="store_true", - help=( - "Use `oc debug node/` to run sosreport on the target node, download the archive, " - "and analyze it as an extracted sosreport." 
- ), - ) - parser.add_argument( - "--no-collect-sosreport", - dest="collect_sosreport", - action="store_false", - help="Disable automatic sosreport collection when targeting a live cluster via --node.", - ) - parser.set_defaults(collect_sosreport=True) - parser.add_argument( - "--sosreport-output", - help=( - "Optional directory to store downloaded sosreport archives and their extraction. " - "Defaults to a temporary directory when omitted." - ), - ) - parser.add_argument( - "--toolbox-image", - default=DEFAULT_TOOLBOX_IMAGE, - help="Container image used by toolbox when collecting sosreport (default: %(default)s).", - ) - parser.add_argument( - "--sosreport-arg", - action="append", - default=[], - metavar="ARG", - help="Additional argument to pass to the sosreport command (repeatable).", - ) - parser.add_argument( - "--skip-default-sosreport-flags", - action="store_true", - help="Do not include the default OpenShift-focused sosreport flags; only use custom --sosreport-arg values.", - ) - parser.add_argument( - "--format", - choices=("markdown", "json"), - default="markdown", - help="Output format.", - ) - parser.add_argument( - "--output", - help="Optional path to write the report. Defaults to stdout when omitted.", - ) - parser.add_argument( - "--max-irq-samples", - type=int, - default=15, - help="Limit how many IRQ affinity mismatches are included in the report.", - ) - return parser.parse_args(argv) - - -def resolve_environment(args: argparse.Namespace) -> EnvironmentPaths: - collect_pref = args.collect_sosreport - if collect_pref and not args.node: - # Cannot collect sosreport without a target node; defer to other sources. 
- collect_pref = False - - if collect_pref: - if args.sosreport: - raise ValueError("Cannot combine --collect-sosreport with --sosreport.") - if not args.node: - raise ValueError("Sosreport collection requires --node.") - sos_dir = collect_sosreport_via_oc_debug( - node=args.node, - oc_binary=args.oc_binary, - kubeconfig=args.kubeconfig, - keep_snapshot=args.keep_snapshot, - output_base=args.sosreport_output, - toolbox_image=args.toolbox_image, - proxy_exports_script=_build_proxy_exports_script(), - sosreport_flag_string=_build_sosreport_flag_string( - use_defaults=not args.skip_default_sosreport_flags, - extra_args=args.sosreport_arg, - ), - ) - return _resolve_sosreport_dir(sos_dir) - - if args.sosreport: - return _resolve_sosreport_dir(Path(args.sosreport)) - - if args.node: - return capture_node_snapshot( - node=args.node, - oc_binary=args.oc_binary, - kubeconfig=args.kubeconfig, - keep_snapshot=args.keep_snapshot, - ) - - root = Path(args.root or "/").expanduser().resolve() - if not root.exists(): - raise FileNotFoundError(f"root path '{root}' does not exist") - proc_root = Path(args.proc_root).expanduser().resolve() if args.proc_root else root / "proc" - sys_root = Path(args.sys_root).expanduser().resolve() if args.sys_root else root / "sys" - if not proc_root.is_dir(): - raise FileNotFoundError(f"proc path '{proc_root}' does not exist or is not a directory") - if not sys_root.is_dir(): - raise FileNotFoundError(f"sys path '{sys_root}' does not exist or is not a directory") - return EnvironmentPaths(base=root, proc=proc_root, sys=sys_root, sos_commands=None) - - -def _safe_extract_tar(archive: Path, destination: Path) -> None: - with tarfile.open(archive, "r:*") as tar: - for member in tar.getmembers(): - member_path = destination / member.name - if not _is_within_directory(destination, member_path): - raise ValueError("Archive extraction attempted path traversal.") - tar.extractall(destination) - - -def _is_within_directory(directory: Path, target: Path) -> 
bool: - directory = directory.resolve() - if not directory.exists(): - directory.mkdir(parents=True, exist_ok=True) - try: - target.resolve(strict=False).relative_to(directory) - return True - except ValueError: - return False - - -def _create_artifact_dir(base_dir: Optional[str], prefix: str) -> Path: - if base_dir: - base = Path(base_dir).expanduser().resolve() - base.mkdir(parents=True, exist_ok=True) - path = Path(tempfile.mkdtemp(prefix=f"{prefix}-", dir=str(base))) - else: - path = Path(tempfile.mkdtemp(prefix=f"{prefix}-")) - return path - - -def _resolve_sosreport_dir(path: Path) -> EnvironmentPaths: - base = Path(path).expanduser().resolve() - if not base.exists(): - raise FileNotFoundError(f"sosreport path '{base}' does not exist") - if base.is_file(): - raise ValueError(f"sosreport path '{base}' is a file; provide an extracted directory") - root_candidates = [base] + [child for child in base.iterdir() if child.is_dir()] - proc_root: Optional[Path] = None - sys_root: Optional[Path] = None - sos_commands: Optional[Path] = None - selected_base = base - for candidate in root_candidates: - candidate_proc = candidate / "proc" - if candidate_proc.is_dir(): - proc_root = candidate_proc - candidate_sys = candidate / "sys" - if candidate_sys.is_dir(): - sys_root = candidate_sys - if (candidate / "sos_commands").is_dir(): - sos_commands = candidate / "sos_commands" - if proc_root and sys_root: - selected_base = candidate - break - if proc_root is None: - raise FileNotFoundError(f"Unable to locate a proc/ directory under '{base}'") - if sys_root is None: - possible_sys = proc_root.parent / "sys" - if possible_sys.is_dir(): - sys_root = possible_sys - else: - raise FileNotFoundError(f"Unable to locate a sys/ directory under '{base}'") - return EnvironmentPaths(base=selected_base, proc=proc_root, sys=sys_root, sos_commands=sos_commands) - - -def _build_proxy_exports_script() -> str: - proxy_vars = [ - "HTTP_PROXY", - "http_proxy", - "HTTPS_PROXY", - "https_proxy", - 
"NO_PROXY", - "no_proxy", - ] - lines = [] - for var in proxy_vars: - value = os.environ.get(var) - if value: - lines.append(f"export {var}={shlex.quote(value)}") - return "\n".join(lines) - - -def _build_sosreport_flag_string(*, use_defaults: bool, extra_args: Sequence[str]) -> str: - flags: List[str] = [] - if use_defaults: - flags.extend(DEFAULT_SOSREPORT_FLAGS) - flags.extend(extra_args) - if not flags: - return "" - return " ".join(shlex.quote(flag) for flag in flags) - - -def collect_sosreport_via_oc_debug( - *, - node: str, - oc_binary: str, - kubeconfig: Optional[str], - keep_snapshot: bool, - output_base: Optional[str], - toolbox_image: str, - proxy_exports_script: str, - sosreport_flag_string: str, -) -> Path: - safe_node = node.replace("/", "-") - artifact_dir = _create_artifact_dir(output_base, f"node-tuning-sosreport-{safe_node}") - if not keep_snapshot: - atexit.register(lambda: shutil.rmtree(artifact_dir, ignore_errors=True)) - - archive_path = artifact_dir / "sosreport.tar" - extract_dir = artifact_dir / "extracted" - extract_dir.mkdir(parents=True, exist_ok=True) - - archive_host_path = "/tmp/node-tuning-sosreport.tar.xz" - payload_host_path = "/tmp/node-tuning-toolbox.sh" - proxy_block = "" - if proxy_exports_script.strip(): - proxy_block = textwrap.dedent( - f""" - PROXY_EXPORTS=$(cat <<'__NTO_PROXY__' - {proxy_exports_script} - __NTO_PROXY__ - ) - eval "$PROXY_EXPORTS" - """ - ).strip() - - sosreport_flag_string = sosreport_flag_string or "" - remote_script = textwrap.dedent( - f""" - set -euo pipefail - TOOLBOX_IMAGE={shlex.quote(toolbox_image)} - ARCHIVE_PATH="{archive_host_path}" - PAYLOAD="{payload_host_path}" - TOOLBOX_LOG="/tmp/node-tuning-toolbox.log" - {proxy_block} - cat <<'__NTO_PAYLOAD__' > "$PAYLOAD" - set -euo pipefail - TMPDIR=$(mktemp -d /var/tmp/node-tuning-sos.XXXX) - cleanup() {{ rm -rf "$TMPDIR"; }} - trap cleanup EXIT - SOSREPORT_FLAGS="{sosreport_flag_string}" - sosreport --batch --quiet --tmp-dir "$TMPDIR" $SOSREPORT_FLAGS 
>/dev/null - LATEST=$(ls -1tr "$TMPDIR"/sosreport-* 2>/dev/null | tail -1) - if [ -z "$LATEST" ]; then - echo "Unable to locate sosreport archive" >&2 - exit 1 - fi - mkdir -p "$(dirname "/host{archive_host_path}")" - cp "$LATEST" "/host{archive_host_path}" - __NTO_PAYLOAD__ - - remove_existing() {{ - podman rm -f toolbox- >/dev/null 2>&1 || true - toolbox rm -f node-tuning-sos >/dev/null 2>&1 || true - }} - - remove_existing - - run_toolbox() {{ - local status=0 - if command -v script >/dev/null 2>&1; then - script -q -c "toolbox --container node-tuning-sos --image $TOOLBOX_IMAGE -- /bin/bash /host$PAYLOAD" /dev/null >> "$TOOLBOX_LOG" 2>&1 || status=$? - else - toolbox --container node-tuning-sos --image "$TOOLBOX_IMAGE" -- /bin/bash "/host$PAYLOAD" >> "$TOOLBOX_LOG" 2>&1 || status=$? - fi - return "$status" - }} - - if ! run_toolbox; then - echo "toolbox execution failed; falling back to host sosreport (inspect $TOOLBOX_LOG)" >&2 - if ! bash "/host$PAYLOAD" >> "$TOOLBOX_LOG" 2>&1; then - echo "host sosreport fallback failed; inspect $TOOLBOX_LOG" >&2 - exit 1 - fi - fi - - rm -f "$PAYLOAD" - - if [ ! 
-s "{archive_host_path}" ]; then - echo "sosreport archive missing after execution; inspect $TOOLBOX_LOG" >&2 - exit 1 - fi - - cat "{archive_host_path}" - rm -f "{archive_host_path}" "$TOOLBOX_LOG" - """ - ).strip() - - cmd: List[str] = [oc_binary] - if kubeconfig: - cmd.extend(["--kubeconfig", kubeconfig]) - cmd.extend( - [ - "debug", - f"node/{node}", - "--quiet", - "--", - "/bin/bash", - "-c", - f"chroot /host /bin/bash -c {shlex.quote(remote_script)}", - ] - ) - - try: - with archive_path.open("wb") as archive_handle: - result = subprocess.run( - cmd, - check=False, - stdout=archive_handle, - stderr=subprocess.PIPE, - text=True, - ) - except FileNotFoundError as exc: - raise FileNotFoundError(f"Unable to execute oc binary '{oc_binary}': {exc}") from exc - - if result.returncode != 0: - stderr_output = result.stderr.strip() if result.stderr else "unknown error" - raise RuntimeError(f"`oc debug node/{node}` sosreport capture failed: {stderr_output}") - - _safe_extract_tar(archive_path, extract_dir) - - # Choose the first directory that contains proc/. 
- candidates = [p for p in extract_dir.rglob("proc") if p.is_dir()] - if not candidates: - raise FileNotFoundError("Downloaded sosreport archive did not contain a proc/ directory.") - sos_base = candidates[0].parent - return sos_base - - -def capture_node_snapshot( - *, - node: str, - oc_binary: str, - kubeconfig: Optional[str], - keep_snapshot: bool, -) -> EnvironmentPaths: - tmp_dir = Path(tempfile.mkdtemp(prefix="node-tuning-")) - if not keep_snapshot: - atexit.register(lambda: shutil.rmtree(tmp_dir, ignore_errors=True)) - - tar_path = tmp_dir / "snapshot.tar" - include_args = " ".join(shlex.quote(item) for item in SNAPSHOT_ITEMS) - remote_cmd = ( - "chroot /host /bin/bash -c " - f"'cd / && tar --ignore-failed-read --warning=no-file-changed -cf - {include_args}'" - ) - - cmd: List[str] = [oc_binary] - if kubeconfig: - cmd.extend(["--kubeconfig", kubeconfig]) - cmd.extend(["debug", f"node/{node}", "--quiet", "--", "/bin/bash", "-c", remote_cmd]) - - try: - with tar_path.open("wb") as tar_handle: - result = subprocess.run( - cmd, - check=False, - stdout=tar_handle, - stderr=subprocess.PIPE, - text=True, - ) - except FileNotFoundError as exc: - raise FileNotFoundError( - f"Unable to execute oc binary '{oc_binary}': {exc}" - ) from exc - - if result.returncode != 0: - stderr_output = result.stderr.strip() if result.stderr else "unknown error" - raise RuntimeError( - f"`oc debug node/{node}` failed (exit {result.returncode}): {stderr_output}" - ) - - _safe_extract_tar(tar_path, tmp_dir) - if not keep_snapshot: - tar_path.unlink(missing_ok=True) # type: ignore[arg-type] - - proc_path = tmp_dir / "proc" - sys_path = tmp_dir / "sys" - if not proc_path.exists(): - raise FileNotFoundError("Captured snapshot is missing proc/ data from the node.") - if not sys_path.exists(): - raise FileNotFoundError("Captured snapshot is missing sys/ data from the node.") - return EnvironmentPaths(base=tmp_dir, proc=proc_path, sys=sys_path, sos_commands=None) - - -def _safe_read_text(path: 
Path) -> Optional[str]: - try: - return path.read_text(encoding="utf-8", errors="ignore") - except (FileNotFoundError, IsADirectoryError, PermissionError): - return None - - -def _safe_read_int(path: Path) -> Optional[int]: - text = _safe_read_text(path) - if text is None: - return None - text = text.strip() - if not text: - return None - try: - return int(text) - except ValueError: - return None - - -def _parse_kernel_cmdline(raw_cmdline: Optional[str]) -> Tuple[str, Dict[str, List[str]]]: - if not raw_cmdline: - return "", {} - cmdline = raw_cmdline.replace("\x00", " ").strip() - params: Dict[str, List[str]] = {} - for token in cmdline.split(): - if "=" in token: - key, value = token.split("=", 1) - else: - key, value = token, "" - params.setdefault(key, []).append(value) - return cmdline, params - - -def _parse_cpu_list(expression: str) -> List[int]: - cpus: List[int] = [] - for part in expression.split(","): - part = part.strip() - if not part: - continue - if "-" in part: - start_str, end_str = part.split("-", 1) - try: - start = int(start_str) - end = int(end_str) - except ValueError: - continue - cpus.extend(range(min(start, end), max(start, end) + 1)) - else: - try: - cpus.append(int(part)) - except ValueError: - continue - return sorted(set(cpus)) - - -def _parse_cpu_mask(mask: str) -> List[int]: - cleaned = mask.strip().replace(",", "") - if not cleaned: - return [] - try: - value = int(cleaned, 16) - except ValueError: - return [] - cpus: List[int] = [] - bit = 0 - while value: - if value & 1: - cpus.append(bit) - value >>= 1 - bit += 1 - return cpus - - -def gather_system_info(env: EnvironmentPaths) -> Dict[str, object]: - hostname = _safe_read_text(env.proc / "sys/kernel/hostname") - kernel_release = _safe_read_text(env.proc / "sys/kernel/osrelease") - kernel_version = _safe_read_text(env.proc / "version") - uptime_text = _safe_read_text(env.proc / "uptime") - if uptime_text: - try: - uptime_seconds = float(uptime_text.split()[0]) - except (ValueError, 
IndexError): - uptime_seconds = None - else: - uptime_seconds = None - cmdline_raw = _safe_read_text(env.proc / "cmdline") - cmdline, cmd_params = _parse_kernel_cmdline(cmdline_raw) - num_nodes = 0 - nodes_path = env.sys / "devices/system/node" - if nodes_path.is_dir(): - num_nodes = sum(1 for entry in nodes_path.iterdir() if entry.name.startswith("node")) - return { - "hostname": (hostname or "").strip(), - "kernel_release": (kernel_release or "").strip(), - "kernel_version": (kernel_version or "").strip(), - "uptime_seconds": uptime_seconds, - "kernel_cmdline": cmdline, - "kernel_cmdline_params": cmd_params, - "numa_nodes": num_nodes, - } - - -def gather_cpu_info(env: EnvironmentPaths, cmd_params: Dict[str, List[str]]) -> Dict[str, object]: - cpuinfo_text = _safe_read_text(env.proc / "cpuinfo") - logical_cpus = 0 - sockets: List[str] = [] - cores: List[Tuple[str, str]] = [] - smt_possible = False - if cpuinfo_text: - block: Dict[str, str] = {} - for line in cpuinfo_text.splitlines(): - if not line.strip(): - if block: - logical_cpus += 1 - physical_id = block.get("physical id", str(block.get("processor", logical_cpus - 1))) - core_id = block.get("core id", str(block.get("processor", logical_cpus - 1))) - sockets.append(physical_id) - cores.append((physical_id, core_id)) - siblings = block.get("siblings") - core_count = block.get("cpu cores") - if siblings and core_count: - try: - if int(siblings) > int(core_count): - smt_possible = True - except ValueError: - pass - block = {} - continue - if ":" in line: - key, value = line.split(":", 1) - block[key.strip()] = value.strip() - if block: - logical_cpus += 1 - physical_id = block.get("physical id", str(block.get("processor", logical_cpus - 1))) - core_id = block.get("core id", str(block.get("processor", logical_cpus - 1))) - sockets.append(physical_id) - cores.append((physical_id, core_id)) - siblings = block.get("siblings") - core_count = block.get("cpu cores") - if siblings and core_count: - try: - if 
int(siblings) > int(core_count): - smt_possible = True - except ValueError: - pass - - unique_sockets = sorted(set(sockets)) - unique_cores = sorted(set(cores)) - isolated_params = cmd_params.get("isolcpus", []) + cmd_params.get("tuned.isolcpus", []) - isolated_cpus: List[int] = [] - for value in isolated_params: - isolated_cpus.extend(_parse_cpu_list(value)) - nohz_full = [] - for value in cmd_params.get("nohz_full", []): - nohz_full.extend(_parse_cpu_list(value)) - tuned_non_isol = [] - for value in cmd_params.get("tuned.non_isolcpus", []): - tuned_non_isol.extend(_parse_cpu_list(value)) - - default_irq_affinity = _parse_cpu_mask(_safe_read_text(env.proc / "irq/default_smp_affinity") or "") - - recommendations: List[str] = [] - observations: List[str] = [] - if logical_cpus: - observations.append(f"Detected {logical_cpus} logical CPUs across {len(unique_sockets)} socket(s).") - if smt_possible: - observations.append("Hyper-Threading/SMT appears to be enabled (siblings > cpu cores).") - if isolated_cpus: - observations.append(f"Kernel cmdline isolates CPUs: {','.join(str(cpu) for cpu in isolated_cpus)}.") - else: - if logical_cpus >= 8: - recommendations.append( - "Configure `isolcpus` (or `tuned.non_isolcpus`) to reserve dedicated cores for workload isolation." - ) - if nohz_full and not isolated_cpus: - recommendations.append( - "`nohz_full` specified without matching `isolcpus`; verify scheduler isolation covers intended CPUs." - ) - if tuned_non_isol: - observations.append(f"Tuned non-isolated CPU mask: {','.join(str(cpu) for cpu in sorted(set(tuned_non_isol)))}.") - if default_irq_affinity and isolated_cpus: - overlap = sorted(set(default_irq_affinity) & set(isolated_cpus)) - if overlap: - recommendations.append( - f"Default IRQ affinity includes isolated CPUs ({','.join(map(str, overlap))}); adjust " - "`/proc/irq/default_smp_affinity` and tuned profiles to keep interrupts off dedicated cores." 
- ) - return { - "logical_cpus": logical_cpus, - "sockets": len(unique_sockets), - "physical_cores": len(unique_cores), - "smt_detected": smt_possible, - "isolated_cpus": sorted(set(isolated_cpus)), - "nohz_full": sorted(set(nohz_full)), - "tuned_non_isolcpus": sorted(set(tuned_non_isol)), - "default_irq_affinity": default_irq_affinity, - "observations": observations, - "recommendations": recommendations, - } - - -def gather_hugepage_info(env: EnvironmentPaths) -> Dict[str, object]: - meminfo_text = _safe_read_text(env.proc / "meminfo") - hugepages_total = None - hugepages_free = None - hugepages_rsvd = None - hugepages_surp = None - hugepage_size_kb = None - mem_total_kb = None - if meminfo_text: - for line in meminfo_text.splitlines(): - if line.startswith("HugePages_Total:"): - hugepages_total = int(line.split()[1]) - elif line.startswith("HugePages_Free:"): - hugepages_free = int(line.split()[1]) - elif line.startswith("HugePages_Rsvd:"): - hugepages_rsvd = int(line.split()[1]) - elif line.startswith("HugePages_Surp:"): - hugepages_surp = int(line.split()[1]) - elif line.startswith("Hugepagesize:"): - hugepage_size_kb = int(line.split()[1]) - elif line.startswith("MemTotal:"): - mem_total_kb = int(line.split()[1]) - sysctl_nr_hugepages = _safe_read_int(env.proc / "sys/vm/nr_hugepages") - sysctl_overcommit_huge = _safe_read_int(env.proc / "sys/vm/nr_overcommit_hugepages") - - per_node: Dict[str, Dict[str, int]] = {} - nodes_dir = env.sys / "devices/system/node" - if nodes_dir.is_dir(): - for node_dir in sorted(nodes_dir.iterdir()): - if not node_dir.name.startswith("node"): - continue - node_info: Dict[str, int] = {} - hugepages_dir = node_dir / "hugepages" - if hugepages_dir.is_dir(): - for hp_dir in hugepages_dir.iterdir(): - nr_path = hp_dir / "nr_hugepages" - free_path = hp_dir / "free_hugepages" - if nr_path.exists(): - node_info["total"] = node_info.get("total", 0) + int(nr_path.read_text().strip()) - if free_path.exists(): - node_info["free"] = 
node_info.get("free", 0) + int(free_path.read_text().strip()) - if node_info: - per_node[node_dir.name] = node_info - - recommendations: List[str] = [] - observations: List[str] = [] - if hugepages_total is not None: - observations.append(f"HugePages_Total={hugepages_total} (size={hugepage_size_kb or 'unknown'} KB).") - if hugepages_total == 0: - recommendations.append( - "Huge pages are disabled. Configure `vm.nr_hugepages` or MachineConfig/Tuned profiles if workloads require pinned memory." - ) - elif hugepages_free is not None and hugepages_free / max(hugepages_total, 1) < 0.1: - recommendations.append( - "Huge pages are nearly exhausted (free <10%). Increase the allocation cap or investigate consumption." - ) - if hugepages_rsvd: - observations.append(f"HugePages_Rsvd={hugepages_rsvd}.") - if mem_total_kb and hugepages_total and hugepage_size_kb: - provisioned_percent = (hugepages_total * hugepage_size_kb) / mem_total_kb * 100 - if provisioned_percent < 1: - recommendations.append( - "Huge page pool is <1% of system memory. Verify sizing matches workload requirements." - ) - if sysctl_nr_hugepages and hugepages_total and sysctl_nr_hugepages != hugepages_total: - observations.append( - f"Runtime HugePages_Total ({hugepages_total}) differs from sysctl target ({sysctl_nr_hugepages})." 
- ) - - return { - "hugepages_total": hugepages_total, - "hugepages_free": hugepages_free, - "hugepages_reserved": hugepages_rsvd, - "hugepages_surplus": hugepages_surp, - "hugepage_size_kb": hugepage_size_kb, - "sysctl_nr_hugepages": sysctl_nr_hugepages, - "sysctl_nr_overcommit": sysctl_overcommit_huge, - "per_node": per_node, - "observations": observations, - "recommendations": recommendations, - } - - -SYSCTL_CHECKS: List[Dict[str, object]] = [ - { - "path": "kernel/sched_rt_runtime_us", - "comparison": "eq", - "value": -1, - "message": "Set `kernel.sched_rt_runtime_us=-1` to allow realtime workloads full CPU bandwidth.", - }, - { - "path": "kernel/nmi_watchdog", - "comparison": "eq", - "value": 0, - "message": "Disable NMI watchdog (`kernel.nmi_watchdog=0`) on isolated/latency-sensitive nodes.", - }, - { - "path": "vm/swappiness", - "comparison": "lte", - "value": 10, - "message": "Lower `vm.swappiness` (<=10) to reduce swap pressure on performance nodes.", - }, - { - "path": "vm/zone_reclaim_mode", - "comparison": "eq", - "value": 0, - "message": "Ensure `vm.zone_reclaim_mode=0` unless targeting NUMA-local reclaim.", - }, - { - "path": "net/core/netdev_max_backlog", - "comparison": "gte", - "value": 32768, - "message": "Increase `net.core.netdev_max_backlog` (>=32768) to accommodate bursty NIC traffic.", - }, - { - "path": "net/core/somaxconn", - "comparison": "gte", - "value": 1024, - "message": "Increase `net.core.somaxconn` (>=1024) to avoid listen queue overflows.", - }, - { - "path": "net/ipv4/tcp_tw_reuse", - "comparison": "eq", - "value": 1, - "message": "Enable `net.ipv4.tcp_tw_reuse=1` for faster TIME-WAIT socket reuse.", - }, - { - "path": "net/ipv4/tcp_fin_timeout", - "comparison": "lte", - "value": 30, - "message": "Reduce `net.ipv4.tcp_fin_timeout` (<=30) to shorten FIN-WAIT-2 linger.", - }, - { - "path": "net/ipv4/tcp_rmem", - "comparison": "triplet_min", - "value": (4096, 87380, 16777216), - "message": "Grow `net.ipv4.tcp_rmem` (recommended 
min/def/max >= 4096/87380/16777216).", - }, - { - "path": "net/ipv4/tcp_wmem", - "comparison": "triplet_min", - "value": (4096, 65536, 16777216), - "message": "Grow `net.ipv4.tcp_wmem` (recommended min/def/max >= 4096/65536/16777216).", - }, -] - - -def gather_sysctl_info(env: EnvironmentPaths) -> Dict[str, object]: - results: Dict[str, Dict[str, object]] = {} - recommendations: List[str] = [] - observations: List[str] = [] - for check in SYSCTL_CHECKS: - path = env.proc / "sys" / Path(str(check["path"])) - value_text = _safe_read_text(path) - if value_text is None: - continue - normalized = value_text.strip() - results[str(check["path"])] = {"value": normalized} - comparison = str(check["comparison"]) - target = check["value"] - try: - if comparison == "eq": - actual_int = int(normalized) - if actual_int != int(target): - recommendations.append(str(check["message"])) - elif comparison == "lte": - actual_int = int(normalized) - if actual_int > int(target): - recommendations.append(str(check["message"])) - elif comparison == "gte": - actual_int = int(normalized) - if actual_int < int(target): - recommendations.append(str(check["message"])) - elif comparison == "triplet_min": - actual_parts = [int(part) for part in normalized.split()] - target_parts = list(target) if isinstance(target, (list, tuple)) else [] - if len(actual_parts) >= 3 and len(target_parts) >= 3: - for idx in range(3): - if actual_parts[idx] < target_parts[idx]: - recommendations.append(str(check["message"])) - break - else: - observations.append(f"Unhandled comparison type '{comparison}' for {check['path']}.") - except ValueError: - observations.append(f"Non-integer sysctl value for {check['path']}: '{normalized}'.") - - thp_enabled = _safe_read_text(env.sys / "kernel/mm/transparent_hugepage/enabled") - if thp_enabled: - results["sys.kernel.mm.transparent_hugepage.enabled"] = {"value": thp_enabled.strip()} - if "[never]" not in thp_enabled: - recommendations.append( - "Transparent Hugepages are not 
disabled (`[never]` not selected). Consider setting to `never` for latency-sensitive workloads." - ) - - thp_defrag = _safe_read_text(env.sys / "kernel/mm/transparent_hugepage/defrag") - if thp_defrag: - results["sys.kernel.mm.transparent_hugepage.defrag"] = {"value": thp_defrag.strip()} - if "[never]" not in thp_defrag and "[madvise]" not in thp_defrag: - recommendations.append( - "Transparent Hugepage defrag is aggressive. Set to `never` or `madvise` to reduce allocation jitter." - ) - - return { - "values": results, - "observations": observations, - "recommendations": recommendations, - } - - -def _parse_netstat_file(path: Path) -> Dict[str, Dict[str, int]]: - text = _safe_read_text(path) - if not text: - return {} - lines = [line.strip() for line in text.splitlines() if line.strip()] - parsed: Dict[str, Dict[str, int]] = {} - idx = 0 - while idx + 1 < len(lines): - header = lines[idx].split() - values = lines[idx + 1].split() - if not header or not values: - idx += 2 - continue - section = header[0].rstrip(":") - metrics: Dict[str, int] = {} - for key, value in zip(header[1:], values[1:]): - try: - metrics[key] = int(value) - except ValueError: - continue - parsed[section] = metrics - idx += 2 - return parsed - - -def _parse_sockstat(path: Path) -> Dict[str, Dict[str, int]]: - text = _safe_read_text(path) - if not text: - return {} - parsed: Dict[str, Dict[str, int]] = {} - for line in text.splitlines(): - if ":" not in line: - continue - section, rest = line.split(":", 1) - metrics: Dict[str, int] = {} - parts = rest.split() - for idx in range(0, len(parts), 2): - key = parts[idx] - if idx + 1 >= len(parts): - break - value = parts[idx + 1] - try: - metrics[key] = int(value) - except ValueError: - continue - parsed[section.strip()] = metrics - return parsed - - -def gather_network_info(env: EnvironmentPaths) -> Dict[str, object]: - netstat_data = _parse_netstat_file(env.proc / "net/netstat") - snmp_data = _parse_netstat_file(env.proc / "net/snmp") - 
sockstat_data = _parse_sockstat(env.proc / "net/sockstat") - recommendations: List[str] = [] - observations: List[str] = [] - - tcp_ext = netstat_data.get("TcpExt", {}) - listen_drops = tcp_ext.get("ListenDrops") - backlog_drops = tcp_ext.get("TCPBacklogDrop") - aborted_listens = tcp_ext.get("TCPAbortOnListen") - syncookies_failed = tcp_ext.get("SyncookiesFailed") - if listen_drops and listen_drops > 0: - recommendations.append( - f"Detected {listen_drops} TCP listen drops. Increase `net.core.somaxconn` and review application accept loops." - ) - if backlog_drops and backlog_drops > 0: - recommendations.append( - f"Detected {backlog_drops} TCP backlog drops. Increase `net.core.netdev_max_backlog` / `somaxconn` and tune application backlog." - ) - if aborted_listens and aborted_listens > 0: - observations.append(f"{aborted_listens} connections aborted on listen; investigate SYN flood or backlog exhaustion.") - if syncookies_failed and syncookies_failed > 0: - recommendations.append( - f"Syncookies failures observed ({syncookies_failed}); validate NIC offload settings and SYN cookie limits." - ) - - tcp_sockstat = sockstat_data.get("TCP", {}) - if tcp_sockstat: - in_use = tcp_sockstat.get("inuse") - orphan = tcp_sockstat.get("orphan") - if orphan and in_use and orphan > max(1, in_use // 10): - recommendations.append( - f"High orphaned TCP socket count ({orphan}) relative to in-use sockets ({in_use}). Tune FIN timeout and monitor retransmits." 
- ) - - return { - "netstat": netstat_data, - "snmp": snmp_data, - "sockstat": sockstat_data, - "observations": observations, - "recommendations": recommendations, - } - - -def gather_irq_affinity_info( - env: EnvironmentPaths, - isolated_cpus: Sequence[int], - *, - max_samples: int, -) -> Dict[str, object]: - isolated_set = set(isolated_cpus) - irq_dir = env.proc / "irq" - mismatches: List[Dict[str, object]] = [] - total_irqs_checked = 0 - if irq_dir.is_dir(): - for entry in sorted(irq_dir.iterdir(), key=lambda p: p.name): - if not entry.name.isdigit(): - continue - total_irqs_checked += 1 - effective = _safe_read_text(entry / "effective_affinity_list") - if effective is None: - effective = _safe_read_text(entry / "smp_affinity_list") - if effective is None: - continue - effective_cpus = _parse_cpu_list(effective.strip()) - if isolated_set and any(cpu in isolated_set for cpu in effective_cpus): - desc = _safe_read_text(entry / "actions") or _safe_read_text(entry / "spurious") - if desc: - desc = desc.strip().splitlines()[0] - mismatches.append( - { - "irq": entry.name, - "cpus": effective_cpus, - "detail": desc, - } - ) - recommendations: List[str] = [] - if mismatches: - sample_count = min(len(mismatches), max_samples) - recommendations.append( - f"{len(mismatches)} IRQs overlap isolated CPUs. Relocate interrupt affinities using tuned profiles or `irqbalance` (showing {sample_count})." - ) - return { - "total_irqs_checked": total_irqs_checked, - "isolated_cpu_overlaps": mismatches[:max_samples], - "recommendations": recommendations, - } - - -def gather_process_summary(env: EnvironmentPaths) -> Dict[str, object]: - # Prefer sosreport process snapshot for richer context. 
- ps_snapshot: Optional[Path] = None - if env.sos_commands: - candidates = [ - env.sos_commands / "process/ps_auxwww", - env.sos_commands / "process" / "ps_auxwww", - env.sos_commands / "process" / "ps_auxwww_-www", - ] - for candidate in candidates: - if candidate.exists(): - ps_snapshot = candidate - break - if ps_snapshot is None: - return {"top_processes": [], "recommendations": []} - text = _safe_read_text(ps_snapshot) - if not text: - return {"top_processes": [], "recommendations": []} - lines = [line for line in text.splitlines() if line.strip()] - _header = lines[0] # Header row, not needed for parsing - processes: List[Dict[str, str]] = [] - for line in lines[1:]: - parts = line.split(None, 10) - if len(parts) < 11: - continue - _user, pid, cpu, mem, _vsz, _rss, _tty, _stat, _start, _time, command = parts - processes.append( - { - "pid": pid, - "cpu_percent": cpu, - "mem_percent": mem, - "command": command.strip(), - } - ) - processes.sort(key=lambda entry: float(entry.get("cpu_percent", "0") or "0"), reverse=True) - top_processes = processes[:10] - recommendations: List[str] = [] - for proc in top_processes: - if "irqbalance" in proc["command"]: - recommendations.append( - "Verify irqbalance configuration excludes isolated CPUs (saw irqbalance among top processes)." 
- ) - break - return { - "top_processes": top_processes, - "recommendations": recommendations, - } - - -def assemble_report(env: EnvironmentPaths, max_irq_samples: int) -> Dict[str, object]: - system_info = gather_system_info(env) - cpu_info = gather_cpu_info(env, system_info.get("kernel_cmdline_params", {})) - hugepage_info = gather_hugepage_info(env) - sysctl_info = gather_sysctl_info(env) - network_info = gather_network_info(env) - irq_info = gather_irq_affinity_info( - env, - cpu_info.get("isolated_cpus", []), - max_samples=max_irq_samples, - ) - process_info = gather_process_summary(env) - - recommendations: List[str] = [] - sections = [cpu_info, hugepage_info, sysctl_info, network_info, irq_info, process_info] - for section in sections: - recommendations.extend(section.get("recommendations", [])) # type: ignore[arg-type] - unique_recommendations = sorted(set(rec.strip() for rec in recommendations if rec.strip())) - - return { - "system": system_info, - "cpu": cpu_info, - "hugepages": hugepage_info, - "sysctl": sysctl_info, - "network": network_info, - "irq_affinity": irq_info, - "processes": process_info, - "recommendations": unique_recommendations, - } - - -def format_markdown(report: Dict[str, object]) -> str: - lines: List[str] = [] - system = report["system"] # type: ignore[assignment] - cpu = report["cpu"] # type: ignore[assignment] - hugepages = report["hugepages"] # type: ignore[assignment] - sysctl = report["sysctl"] # type: ignore[assignment] - network = report["network"] # type: ignore[assignment] - irq = report["irq_affinity"] # type: ignore[assignment] - processes = report["processes"] # type: ignore[assignment] - - lines.append("# Node Tuning Analysis") - lines.append("") - lines.append("## System Overview") - lines.append(f"- Hostname: {system.get('hostname') or 'unknown'}") - lines.append(f"- Kernel: {system.get('kernel_release') or 'unknown'}") - lines.append(f"- NUMA nodes: {system.get('numa_nodes')}") - cmdline = system.get("kernel_cmdline") 
or "" - if cmdline: - lines.append(f"- Kernel cmdline: `{cmdline}`") - uptime = system.get("uptime_seconds") - if uptime is not None: - lines.append(f"- Uptime: {uptime:.0f} seconds") - - lines.append("") - lines.append("## CPU & Isolation") - lines.append(f"- Logical CPUs: {cpu.get('logical_cpus')}") - lines.append(f"- Physical cores: {cpu.get('physical_cores')} across {cpu.get('sockets')} socket(s)") - lines.append(f"- SMT detected: {'yes' if cpu.get('smt_detected') else 'no'}") - if cpu.get("isolated_cpus"): - lines.append(f"- Isolated CPUs: {','.join(str(v) for v in cpu['isolated_cpus'])}") # type: ignore[index] - if cpu.get("nohz_full"): - lines.append(f"- nohz_full CPUs: {','.join(str(v) for v in cpu['nohz_full'])}") # type: ignore[index] - if cpu.get("tuned_non_isolcpus"): - lines.append( - f"- tuned.non_isolcpus: {','.join(str(v) for v in cpu['tuned_non_isolcpus'])}" # type: ignore[index] - ) - for obs in cpu.get("observations", []): - lines.append(f"- {obs}") - - lines.append("") - lines.append("## Huge Pages") - lines.append(f"- Total: {hugepages.get('hugepages_total')} (size={hugepages.get('hugepage_size_kb')} KB)") - lines.append(f"- Free: {hugepages.get('hugepages_free')}, Reserved: {hugepages.get('hugepages_reserved')}") - if hugepages.get("per_node"): - per_node = hugepages["per_node"] # type: ignore[assignment] - node_summaries = [] - for node, values in per_node.items(): - node_summaries.append(f"{node}:total={values.get('total',0)}/free={values.get('free',0)}") - lines.append(f"- Per NUMA node: {', '.join(node_summaries)}") - for obs in hugepages.get("observations", []): - lines.append(f"- {obs}") - - lines.append("") - lines.append("## Sysctl Highlights") - for key, info in sorted(sysctl.get("values", {}).items()): # type: ignore[call-arg] - lines.append(f"- {key}: {info.get('value')}") - for obs in sysctl.get("observations", []): - lines.append(f"- {obs}") - - lines.append("") - lines.append("## Network Signals") - tcp_ext = 
network.get("netstat", {}).get("TcpExt", {}) # type: ignore[index] - if tcp_ext: - lines.append( - "- TcpExt counters: " - + ", ".join(f"{key}={value}" for key, value in list(tcp_ext.items())[:8]) - ) - tcp_sock = network.get("sockstat", {}).get("TCP", {}) # type: ignore[index] - if tcp_sock: - lines.append("- Sockstat TCP: " + ", ".join(f"{k}={v}" for k, v in tcp_sock.items())) - for obs in network.get("observations", []): - lines.append(f"- {obs}") - - lines.append("") - lines.append("## IRQ Affinity") - lines.append(f"- IRQs inspected: {irq.get('total_irqs_checked')}") - overlaps = irq.get("isolated_cpu_overlaps", []) - if overlaps: - lines.append(f"- IRQs overlapping isolated CPUs: {len(overlaps)}") - for entry in overlaps: - lines.append( - f" - IRQ {entry.get('irq')}: CPUs {','.join(str(cpu) for cpu in entry.get('cpus', []))}" - ) - else: - lines.append("- No IRQ affinity overlaps with isolated CPUs detected.") - - process_list = processes.get("top_processes", []) - if process_list: - lines.append("") - lines.append("## Process Snapshot (top by %CPU)") - for proc in process_list[:5]: - lines.append( - f"- PID {proc['pid']}: {proc['cpu_percent']}% CPU, {proc['mem_percent']}% MEM, cmd='{proc['command']}'" - ) - - recommendations = report.get("recommendations", []) - if recommendations: - lines.append("") - lines.append("## Recommended Actions") - for rec in recommendations: - lines.append(f"- {rec}") - - return "\n".join(lines) + "\n" - - -def main(argv: Sequence[str]) -> int: - args = parse_arguments(argv) - try: - env = resolve_environment(args) - report = assemble_report(env, max_irq_samples=args.max_irq_samples) - except (FileNotFoundError, ValueError) as exc: - print(f"error: {exc}", file=sys.stderr) - return 1 - - if args.format == "json": - output = json.dumps(report, indent=2) - else: - output = format_markdown(report) - - if args.output: - output_path = Path(args.output).expanduser() - if output_path.parent and not output_path.parent.exists(): - 
output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.write_text(output, encoding="utf-8") - print(f"Wrote analysis report to {output_path}") - else: - sys.stdout.write(output) - return 0 - - -if __name__ == "__main__": - raise SystemExit(main(sys.argv[1:])) - - diff --git a/plugins/node-tuning/skills/scripts/generate_tuned_profile.py b/plugins/node-tuning/skills/scripts/generate_tuned_profile.py deleted file mode 100644 index fd4f3d8fe..000000000 --- a/plugins/node-tuning/skills/scripts/generate_tuned_profile.py +++ /dev/null @@ -1,414 +0,0 @@ -""" -Utility script to generate tuned.openshift.io/v1 Tuned manifests. - -The script is intentionally dependency-free so it can run anywhere Python 3.8+ -is available (CI, developer workstations, or automation pipelines). -""" - -from __future__ import annotations - -import argparse -import json -import os -import subprocess -import sys -from collections import OrderedDict -from typing import Iterable, List, Optional, Sequence, Tuple - - -def _parse_key_value_pairs( - raw_values: Sequence[str], - *, - parameter: str, - allow_empty_value: bool = False, -) -> List[Tuple[str, str]]: - """Split KEY=VALUE (or KEY when allow_empty_value=True) pairs.""" - parsed: List[Tuple[str, str]] = [] - for raw in raw_values: - if "=" in raw: - key, value = raw.split("=", 1) - elif allow_empty_value: - key, value = raw, "" - else: - raise ValueError(f"{parameter} expects KEY=VALUE entries, got '{raw}'") - key = key.strip() - value = value.strip() - if not key: - raise ValueError(f"{parameter} entries must include a non-empty key (got '{raw}')") - parsed.append((key, value)) - return parsed - - -def _parse_section_entries(raw_values: Sequence[str]) -> List[Tuple[str, str, str]]: - """ - Parse SECTION:KEY=VALUE entries for arbitrary tuned.ini sections. 
- - Examples: - bootloader:cmdline_ocp_realtime=+nohz_full=1-3 - service:service.stalld=start,enable - """ - parsed: List[Tuple[str, str, str]] = [] - for raw in raw_values: - if ":" not in raw: - raise ValueError( - f"--section expects SECTION:KEY=VALUE entries, got '{raw}'" - ) - section, remainder = raw.split(":", 1) - section = section.strip() - if not section: - raise ValueError(f"--section requires a section name before ':', got '{raw}'") - key_value = _parse_key_value_pairs([remainder], parameter="--section") - parsed.append((section, key_value[0][0], key_value[0][1])) - return parsed - - -def _build_profile_ini( - *, - summary: str, - includes: Sequence[str], - main_options: Sequence[Tuple[str, str]], - variables: Sequence[Tuple[str, str]], - sysctls: Sequence[Tuple[str, str]], - extra_sections: Sequence[Tuple[str, str, str]], -) -> str: - sections: "OrderedDict[str, List[str]]" = OrderedDict() - sections["main"] = [f"summary={summary}"] - if includes: - sections["main"].append(f"include={','.join(includes)}") - for key, value in main_options: - sections["main"].append(f"{key}={value}") - - if variables: - sections["variables"] = [f"{key}={value}" for key, value in variables] - if sysctls: - sections["sysctl"] = [f"{key}={value}" for key, value in sysctls] - - for section, key, value in extra_sections: - section = section.strip() - if not section: - continue - if section not in sections: - sections[section] = [] - sections[section].append(f"{key}={value}") - - rendered_sections: List[str] = [] - non_empty_sections = [(name, lines) for name, lines in sections.items() if lines] - for idx, (name, lines) in enumerate(non_empty_sections): - rendered_sections.append(f"[{name}]") - rendered_sections.extend(lines) - if idx != len(non_empty_sections) - 1: - rendered_sections.append("") - return "\n".join(rendered_sections) - - -def _json_string(value: str) -> str: - """Return a JSON-encoded string (adds surrounding quotes, escapes).""" - return json.dumps(value) - - 
-def _render_manifest( - *, - profile_name: str, - namespace: str, - profile_ini: str, - machine_config_labels: Sequence[Tuple[str, str]], - match_labels: Sequence[Tuple[str, str]], - priority: int, -) -> str: - lines: List[str] = [ - "apiVersion: tuned.openshift.io/v1", - "kind: Tuned", - "metadata:", - f" name: {profile_name}", - ] - if namespace: - lines.append(f" namespace: {namespace}") - lines.extend( - [ - "spec:", - " profile:", - " - data: |", - ] - ) - profile_lines = profile_ini.splitlines() - if not profile_lines: - raise ValueError("Profile contents may not be empty") - for entry in profile_lines: - # Preserve blank lines for readability inside the literal block. - if entry: - lines.append(f" {entry}") - else: - lines.append(" ") - lines.append(f" name: {profile_name}") - - if not machine_config_labels and not match_labels: - raise ValueError("At least one --machine-config-label or --match-label must be provided") - - lines.append(" recommend:") - - if machine_config_labels: - lines.append(" - machineConfigLabels:") - for key, value in machine_config_labels: - lines.append(f" {key}: {_json_string(value)}") - start_written = True - else: - start_written = False - - if match_labels: - prefix = " match:" if start_written else " - match:" - lines.append(prefix) - item_indent = " - " if start_written else " - " - value_indent = " " if start_written else " " - for label, value in match_labels: - lines.append(f"{item_indent}label: {_json_string(label)}") - if value != "": - lines.append(f"{value_indent}value: {_json_string(value)}") - start_written = True - - priority_prefix = " priority" if start_written else " - priority" - lines.append(f"{priority_prefix}: {priority}") - - profile_prefix = " profile" if start_written else " - profile" - lines.append(f"{profile_prefix}: {_json_string(profile_name)}") - - return "\n".join(lines) + "\n" - - -def _run_oc_command(command: Sequence[str]) -> subprocess.CompletedProcess: - """Execute an oc command and return the 
completed process.""" - try: - result = subprocess.run( - command, - check=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - ) - except FileNotFoundError as exc: - raise RuntimeError( - "Unable to locate the 'oc' binary. Install the OpenShift CLI or set --oc-binary." - ) from exc - except subprocess.CalledProcessError as exc: - message = exc.stderr.strip() or exc.stdout.strip() or str(exc) - raise RuntimeError(f"Command '{' '.join(command)}' failed: {message}") from exc - return result - - -def list_nodes(*, oc_binary: str, selector: Optional[str]) -> List[str]: - """List nodes using the oc CLI and return their names.""" - command: List[str] = [oc_binary, "get", "nodes", "-o", "name"] - if selector: - command.extend(["-l", selector]) - result = _run_oc_command(command) - nodes = [line.strip() for line in result.stdout.splitlines() if line.strip()] - if nodes: - for node in nodes: - print(node) - else: - print("No nodes matched the provided selector.") - return nodes - - -def label_nodes( - *, - oc_binary: str, - entries: Sequence[str], - overwrite: bool, -) -> None: - """Label nodes via oc CLI using NODE:label entries.""" - if not entries: - return - for raw in entries: - if ":" not in raw: - raise ValueError( - f"--label-node expects NODE:KEY[=VALUE] format (e.g. 
node1:node-role.kubernetes.io/worker-hp=) - got '{raw}'" - ) - node_name, label = raw.split(":", 1) - node_name = node_name.strip() - label = label.strip() - if not node_name or not label: - raise ValueError(f"--label-node entry must include both node name and label (got '{raw}')") - command: List[str] = [oc_binary, "label", "node", node_name, label] - if overwrite: - command.append("--overwrite") - _run_oc_command(command) - print(f"Labeled {node_name} with {label}") - - -def generate_manifest(args: argparse.Namespace) -> str: - includes = [value.strip() for value in args.include or [] if value.strip()] - - main_options = _parse_key_value_pairs(args.main_option or [], parameter="--main-option") - variables = _parse_key_value_pairs(args.variable or [], parameter="--variable") - sysctls = _parse_key_value_pairs(args.sysctl or [], parameter="--sysctl") - extra_sections = _parse_section_entries(args.section or []) - - match_labels = _parse_key_value_pairs( - args.match_label or [], - parameter="--match-label", - allow_empty_value=True, - ) - machine_config_labels = _parse_key_value_pairs( - args.machine_config_label or [], - parameter="--machine-config-label", - allow_empty_value=True, - ) - - profile_ini = _build_profile_ini( - summary=args.summary, - includes=includes, - main_options=main_options, - variables=variables, - sysctls=sysctls, - extra_sections=extra_sections, - ) - - manifest = _render_manifest( - profile_name=args.profile_name, - namespace=args.namespace, - profile_ini=profile_ini, - machine_config_labels=machine_config_labels, - match_labels=match_labels, - priority=args.priority, - ) - return manifest - - -def parse_arguments(argv: Iterable[str]) -> argparse.Namespace: - parser = argparse.ArgumentParser( - description="Generate tuned.openshift.io/v1 Tuned manifests for the Node Tuning Operator.", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - parser.add_argument("--profile-name", help="Name of the Tuned profile and resource") - 
parser.add_argument("--summary", help="Summary placed inside the [main] section") - parser.add_argument( - "--namespace", - default="openshift-cluster-node-tuning-operator", - help="Namespace to place in metadata.namespace", - ) - parser.add_argument( - "--include", - action="append", - help="Append an entry to the 'include=' list (multiple flags allowed)", - ) - parser.add_argument( - "--main-option", - action="append", - help="Add KEY=VALUE to the [main] section beyond summary/include", - ) - parser.add_argument( - "--variable", - action="append", - help="Add KEY=VALUE to the [variables] section", - ) - parser.add_argument( - "--sysctl", - action="append", - help="Add KEY=VALUE to the [sysctl] section", - ) - parser.add_argument( - "--section", - action="append", - help="Add arbitrary SECTION:KEY=VALUE lines (e.g. bootloader:cmdline=...)", - ) - parser.add_argument( - "--machine-config-label", - action="append", - help="Add a MachineConfigPool selector (key=value) under machineConfigLabels", - ) - parser.add_argument( - "--match-label", - action="append", - help="Add a node label entry (key[=value]) under recommend[].match[]", - ) - parser.add_argument( - "--priority", - type=int, - default=20, - help="Recommendation priority", - ) - parser.add_argument( - "--output", - help="Output file path; defaults to .yaml in the current directory", - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="Print manifest to stdout instead of writing to disk", - ) - parser.add_argument( - "--skip-manifest", - action="store_true", - help="Skip manifest generation; useful when only listing or labeling nodes", - ) - parser.add_argument( - "--list-nodes", - action="store_true", - help="List nodes via 'oc get nodes' before other actions", - ) - parser.add_argument( - "--node-selector", - help="Label selector to filter nodes when using --list-nodes", - ) - parser.add_argument( - "--label-node", - action="append", - help="Label nodes using NODE:KEY[=VALUE] entries 
(repeat for multiple nodes)", - ) - parser.add_argument( - "--overwrite-labels", - action="store_true", - help="Allow overwriting existing labels when using --label-node", - ) - parser.add_argument( - "--oc-binary", - default=os.environ.get("OC_BIN", "oc"), - help="Path to the oc binary to execute", - ) - return parser.parse_args(argv) - - -def main(argv: Sequence[str]) -> int: - args = parse_arguments(argv) - try: - if args.list_nodes: - list_nodes(oc_binary=args.oc_binary, selector=args.node_selector) - - if args.label_node: - label_nodes( - oc_binary=args.oc_binary, - entries=args.label_node, - overwrite=args.overwrite_labels, - ) - - if args.skip_manifest: - return 0 - - if not args.profile_name: - raise ValueError("--profile-name is required unless --skip-manifest is set") - if not args.summary: - raise ValueError("--summary is required unless --skip-manifest is set") - - manifest = generate_manifest(args) - except (ValueError, RuntimeError) as exc: - print(f"error: {exc}", file=sys.stderr) - return 1 - - if args.dry_run: - sys.stdout.write(manifest) - return 0 - - output_path = args.output or f"{args.profile_name}.yaml" - output_dir = os.path.dirname(output_path) - if output_dir and not os.path.exists(output_dir): - os.makedirs(output_dir, exist_ok=True) - - with open(output_path, "w", encoding="utf-8") as handle: - handle.write(manifest) - print(f"Wrote Tuned manifest to {output_path}") - return 0 - - -if __name__ == "__main__": - raise SystemExit(main(sys.argv[1:])) - diff --git a/plugins/node/.claude-plugin/plugin.json b/plugins/node/.claude-plugin/plugin.json deleted file mode 100644 index 3d05be04a..000000000 --- a/plugins/node/.claude-plugin/plugin.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "name": "node", - "description": "Kubernetes and OpenShift node health monitoring and diagnostics", - "version": "0.0.2", - "author": { - "name": "github.com/openshift-eng" - } -} diff --git a/plugins/node/README.md b/plugins/node/README.md deleted file mode 100644 
index a50dc186f..000000000 --- a/plugins/node/README.md +++ /dev/null @@ -1,158 +0,0 @@ -# Node Plugin - -Kubernetes and OpenShift node health monitoring and diagnostics. - -## Overview - -The Node plugin provides comprehensive health checking and diagnostic capabilities for Kubernetes and OpenShift cluster nodes. It automates the inspection of node-level components including kubelet, CRI-O container runtime, system resources, and node conditions to ensure nodes are functioning properly. - -## Commands - -### `/node:cluster-node-health-check` - -Perform comprehensive health check on cluster nodes and report kubelet, CRI-O, and node-level issues. - -**Usage:** -```bash -/node:cluster-node-health-check [--node ] [--verbose] [--output-format json|text] -``` - -**Arguments:** -- `--node ` (optional): Name of a specific node to check. If not provided, checks all nodes in the cluster. -- `--verbose` (optional): Enable detailed output with additional context, including resource-level details, warning conditions, and remediation suggestions. -- `--output-format` (optional): Output format for results (`text` or `json`). Defaults to `text`. - -**Examples:** - -Check all nodes in the cluster: -```bash -/node:cluster-node-health-check -``` - -Check a specific node: -```bash -/node:cluster-node-health-check --node worker-1 -``` - -Verbose output with detailed diagnostics: -```bash -/node:cluster-node-health-check --verbose -``` - -JSON output for automation: -```bash -/node:cluster-node-health-check --output-format json -``` - -**What it checks:** - -1. **Node Status and Conditions** - - Ready status - - MemoryPressure, DiskPressure, PIDPressure - - NetworkUnavailable condition - - Node taints and scheduling constraints - -2. **Kubelet Service Health** - - Service status and restart counts - - Certificate validity - - Configuration issues - -3. **CRI-O Container Runtime** - - Runtime service status - - Container operation errors - - Version compatibility - -4. 
**Resource Utilization** - - CPU and memory allocation - - Disk space usage - - Pod count vs capacity - - Ephemeral storage - -5. **System Services** - - Critical daemon status (kubelet, crio) - - Failed systemd units - -6. **Kernel Parameters** - - Key sysctl settings for Kubernetes - - SELinux status - -7. **Pod Health on Nodes** - - Running, pending, and failed pods - - High restart counts - - Resource pressure impact - -8. **Recent Events** - - Warning events for nodes - - Pod events on nodes - -**Output:** - -The command provides: -- Overall health status (Healthy ✅ / Warning ⚠️ / Critical ❌) -- Detailed findings for each node -- Specific issues with severity levels -- Impact assessment -- Recommended remediation actions -- Diagnostic commands for further investigation - -See [commands/cluster-node-health-check.md](commands/cluster-node-health-check.md) for detailed documentation. - -## Prerequisites - -- **Kubernetes/OpenShift CLI**: Either `oc` or `kubectl` must be installed -- **Active cluster connection**: Must be connected to a running cluster -- **Sufficient permissions**: Read access to nodes and pods, ability to create debug pods for node-level inspection - -## Use Cases - -- **Pre-deployment validation**: Verify node health before deploying applications -- **Troubleshooting**: Diagnose node-related issues affecting workload performance -- **Capacity planning**: Understand resource utilization across nodes -- **Proactive monitoring**: Regular health checks to catch issues early -- **Post-upgrade validation**: Verify node health after cluster upgrades -- **CI/CD integration**: Automated node health verification in pipelines - -## Common Issues Detected - -The plugin can detect and report: - -- Nodes in NotReady state -- Kubelet service failures or frequent restarts -- CRI-O runtime errors -- Memory or disk pressure conditions -- Network unavailability -- High pod restart counts -- Resource exhaustion (CPU, memory, disk) -- Failed system services -- 
Certificate expiration warnings -- Scheduling constraints (taints, labels) - -## Installation - -### From Marketplace - -```bash -# Add the ai-helpers marketplace -/plugin marketplace add openshift-eng/ai-helpers - -# Install the node plugin -/plugin install node@ai-helpers -``` - -### Manual Installation - -```bash -# Clone the repository -git clone https://github.com/openshift-eng/ai-helpers.git - -# Link to Claude Code plugins directory -ln -s $(pwd)/ai-helpers/plugins/node ~/.claude/plugins/node -``` - -## Contributing - -Contributions are welcome! Please see the main [CLAUDE.md](../../CLAUDE.md) for plugin development guidelines. - -## License - -Apache License 2.0 - See [LICENSE](../../LICENSE) for details. diff --git a/plugins/node/commands/cluster-node-health-check.md b/plugins/node/commands/cluster-node-health-check.md deleted file mode 100644 index 7312cd107..000000000 --- a/plugins/node/commands/cluster-node-health-check.md +++ /dev/null @@ -1,760 +0,0 @@ ---- -description: Perform comprehensive health check on cluster nodes and report kubelet, CRI-O, and node-level issues -argument-hint: "[--node ] [--verbose] [--output-format json|text]" ---- - -## Name -node:cluster-node-health-check - -## Synopsis - -``` -/node:cluster-node-health-check [--node ] [--verbose] [--output-format json|text] -``` - -## Description - -The `/node:cluster-node-health-check` command performs an extensive diagnostic of Kubernetes/OpenShift cluster nodes to assess their operational health, stability, and performance. It automates the validation of node-level components including kubelet, CRI-O container runtime, system resources, and node conditions to ensure nodes are functioning as expected. - -The command runs a comprehensive set of health checks covering node status, kubelet health, container runtime (CRI-O) operations, resource utilization, system daemons, and kernel parameters. 
It also detects degraded states, disk/memory pressure, network issues, and recent warning events. - -Specifically, it performs the following: - -- Detects and validates the availability of oc or kubectl CLI tools and verifies cluster connectivity. -- Checks all node statuses (Ready, MemoryPressure, DiskPressure, PIDPressure, NetworkUnavailable) and reports any abnormal conditions with detailed reasons and messages. -- Validates kubelet service health on each node, detecting service failures, high restart counts, and configuration issues. -- Performs CRI-O runtime health checks to ensure container operations are functioning correctly. -- Inspects resource utilization including CPU, memory, disk space, and process/pod counts against allocatable resources. -- Evaluates system daemon health (systemd services) critical for node operations. -- Examines kernel parameters and system tunables relevant to Kubernetes operations. -- Scans for recent warning events at the node level and for pods running on nodes. -- Reviews certificate validity for kubelet client certificates. -- Identifies node taints, labels, and scheduling constraints that may affect workload placement. -- Generates a clear, color-coded summary report and optionally exports findings in JSON format for automation or CI integration. - -## Prerequisites - -Before using this command, ensure you have: - -1. **Kubernetes/OpenShift CLI**: Either `oc` (OpenShift) or `kubectl` (Kubernetes) - - Install `oc` from: - - Or install `kubectl` from: - - Verify with: `oc version` or `kubectl version` - -2. **Active cluster connection**: Must be connected to a running cluster - - Verify with: `oc whoami` or `kubectl cluster-info` - - Ensure KUBECONFIG is set if needed - -3. 
**Sufficient permissions**: Must have read access to cluster resources - - Cluster-admin or monitoring role recommended for comprehensive checks - - Minimum: ability to view nodes, pods, and node metrics - - For node debugging (accessing journalctl, crictl): ability to create debug pods or ssh access - -## Arguments - -- **--node** (optional): Name of a specific node to check. If not provided, checks all nodes in the cluster. Example: `--node ip-10-0-1-23.ec2.internal` - -- **--verbose** (optional): Enable detailed output with additional context - - Shows resource-level details - - Includes warning conditions - - Provides remediation suggestions - -- **--output-format** (optional): Output format for results - - `text` (default): Human-readable text format - - `json`: Machine-readable JSON format for automation - -## Implementation - -The command performs the following health checks: - -### 1. Determine CLI Tool and Verify Connectivity - -Detect which Kubernetes CLI is available and verify cluster connection: - -```bash -if command -v oc &> /dev/null; then - CLI="oc" -elif command -v kubectl &> /dev/null; then - CLI="kubectl" -else - echo "Error: Neither 'oc' nor 'kubectl' CLI found. Please install one of them." - exit 1 -fi - -# Verify cluster connectivity -if ! $CLI cluster-info &> /dev/null; then - echo "Error: Not connected to a cluster. Please configure your KUBECONFIG." - exit 1 -fi -``` - -### 2. Initialize Health Check Report - -Create a report structure to collect findings: - -```bash -NODE_FILTER=${NODE_FILTER:-""} -VERBOSE=${VERBOSE:-false} -OUTPUT_FORMAT=${OUTPUT_FORMAT:-"text"} - -REPORT_FILE=".work/node-health-check/report-$(date +%Y%m%d-%H%M%S).txt" -mkdir -p .work/node-health-check - -# Initialize counters -CRITICAL_ISSUES=0 -WARNING_ISSUES=0 -INFO_MESSAGES=0 -``` - -### 3. Check Node Status and Conditions - -Verify node health and readiness: - -```bash -echo "Checking Node Status..." 
- -# Get all nodes or specific node -if [ -n "$NODE_FILTER" ]; then - NODES=$NODE_FILTER -else - NODES=$($CLI get nodes -o jsonpath='{.items[*].metadata.name}') -fi - -for node in $NODES; do - echo " Checking node: $node" - - # Check if node exists - if ! $CLI get node "$node" &> /dev/null; then - CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1)) - echo "❌ CRITICAL: Node '$node' not found" - continue - fi - - # Get node conditions - NODE_READY=$($CLI get node "$node" -o json | jq -r '.status.conditions[] | select(.type=="Ready") | .status') - NODE_MEMORY_PRESSURE=$($CLI get node "$node" -o json | jq -r '.status.conditions[] | select(.type=="MemoryPressure") | .status') - NODE_DISK_PRESSURE=$($CLI get node "$node" -o json | jq -r '.status.conditions[] | select(.type=="DiskPressure") | .status') - NODE_PID_PRESSURE=$($CLI get node "$node" -o json | jq -r '.status.conditions[] | select(.type=="PIDPressure") | .status') - NODE_NETWORK=$($CLI get node "$node" -o json | jq -r '.status.conditions[] | select(.type=="NetworkUnavailable") | .status // "False"') - - if [ "$NODE_READY" != "True" ]; then - CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1)) - echo "❌ CRITICAL: Node $node is not Ready" - $CLI get node "$node" -o json | jq -r '.status.conditions[] | select(.type=="Ready") | " Reason: \(.reason)\n Message: \(.message)"' - fi - - if [ "$NODE_MEMORY_PRESSURE" == "True" ]; then - CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1)) - echo "❌ CRITICAL: Node $node has MemoryPressure" - $CLI get node "$node" -o json | jq -r '.status.conditions[] | select(.type=="MemoryPressure") | " Reason: \(.reason)\n Message: \(.message)"' - fi - - if [ "$NODE_DISK_PRESSURE" == "True" ]; then - CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1)) - echo "❌ CRITICAL: Node $node has DiskPressure" - $CLI get node "$node" -o json | jq -r '.status.conditions[] | select(.type=="DiskPressure") | " Reason: \(.reason)\n Message: \(.message)"' - fi - - if [ "$NODE_PID_PRESSURE" == "True" ]; then - WARNING_ISSUES=$((WARNING_ISSUES + 1)) 
- echo "⚠️ WARNING: Node $node has PIDPressure" - $CLI get node "$node" -o json | jq -r '.status.conditions[] | select(.type=="PIDPressure") | " Reason: \(.reason)\n Message: \(.message)"' - fi - - if [ "$NODE_NETWORK" == "True" ]; then - CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1)) - echo "❌ CRITICAL: Node $node has NetworkUnavailable" - $CLI get node "$node" -o json | jq -r '.status.conditions[] | select(.type=="NetworkUnavailable") | " Reason: \(.reason)\n Message: \(.message)"' - fi - - # Check node version and kubelet version - KUBELET_VERSION=$($CLI get node "$node" -o json | jq -r '.status.nodeInfo.kubeletVersion') - echo " ℹ️ Kubelet version: $KUBELET_VERSION" - - # Check for taints - TAINTS=$($CLI get node "$node" -o json | jq -r '.spec.taints // [] | length') - if [ "$TAINTS" -gt 0 ]; then - echo " ℹ️ Node has $TAINTS taint(s)" - $CLI get node "$node" -o json | jq -r '.spec.taints[] | " - \(.key)=\(.value):\(.effect)"' - fi -done -``` - -### 4. Check Kubelet Service Health - -Examine kubelet service status on each node using debug pods: - -```bash -echo "Checking Kubelet Service Health..." 
- -for node in $NODES; do - echo " Checking kubelet on node: $node" - - # Use debug pod to check kubelet service - KUBELET_STATUS=$($CLI debug node/"$node" --image=registry.access.redhat.com/ubi9/ubi-minimal -- chroot /host systemctl is-active kubelet 2>/dev/null || echo "failed") - - if [ "$KUBELET_STATUS" != "active" ]; then - CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1)) - echo "❌ CRITICAL: Kubelet service is not active on node $node (Status: $KUBELET_STATUS)" - - # Get kubelet logs for troubleshooting - if [ "$VERBOSE" = true ]; then - echo " Recent kubelet logs:" - $CLI debug node/"$node" --image=registry.access.redhat.com/ubi9/ubi-minimal -- chroot /host journalctl -u kubelet -n 20 --no-pager 2>/dev/null - fi - else - # Check for kubelet restarts - KUBELET_RESTART_COUNT=$($CLI debug node/"$node" --image=registry.access.redhat.com/ubi9/ubi-minimal -- chroot /host systemctl show kubelet -p NRestarts --value 2>/dev/null || echo "0") - - if [ "$KUBELET_RESTART_COUNT" -gt 3 ]; then - WARNING_ISSUES=$((WARNING_ISSUES + 1)) - echo "⚠️ WARNING: Kubelet has restarted $KUBELET_RESTART_COUNT times on node $node" - fi - fi - - # Check kubelet certificate expiration - CERT_EXPIRY=$($CLI get node "$node" -o json | jq -r '.status.conditions[] | select(.type=="Ready") | .message' | grep -i "certificate" || echo "") - if [ -n "$CERT_EXPIRY" ]; then - WARNING_ISSUES=$((WARNING_ISSUES + 1)) - echo "⚠️ WARNING: Certificate issue on node $node: $CERT_EXPIRY" - fi -done -``` - -### 5. Check CRI-O Container Runtime Health - -Verify CRI-O runtime health and operations: - -```bash -echo "Checking CRI-O Container Runtime..." 
- -for node in $NODES; do - echo " Checking CRI-O on node: $node" - - # Check crio service status - CRIO_STATUS=$($CLI debug node/"$node" --image=registry.access.redhat.com/ubi9/ubi-minimal -- chroot /host systemctl is-active crio 2>/dev/null || echo "failed") - - if [ "$CRIO_STATUS" != "active" ]; then - CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1)) - echo "❌ CRITICAL: CRI-O service is not active on node $node (Status: $CRIO_STATUS)" - - if [ "$VERBOSE" = true ]; then - echo " Recent CRI-O logs:" - $CLI debug node/"$node" --image=registry.access.redhat.com/ubi9/ubi-minimal -- chroot /host journalctl -u crio -n 20 --no-pager 2>/dev/null - fi - else - # Check CRI-O version - CRIO_VERSION=$($CLI debug node/"$node" --image=registry.access.redhat.com/ubi9/ubi-minimal -- chroot /host crictl version -o json 2>/dev/null | jq -r '.runtimeVersion // "unknown"') - echo " ℹ️ CRI-O version: $CRIO_VERSION" - - # Check for container runtime errors - RUNTIME_ERRORS=$($CLI debug node/"$node" --image=registry.access.redhat.com/ubi9/ubi-minimal -- chroot /host journalctl -u crio --since "1 hour ago" -p err --no-pager 2>/dev/null | wc -l) - - if [ "$RUNTIME_ERRORS" -gt 10 ]; then - WARNING_ISSUES=$((WARNING_ISSUES + 1)) - echo "⚠️ WARNING: CRI-O has $RUNTIME_ERRORS errors in the last hour on node $node" - fi - fi -done -``` - -### 6. Check Node Resource Utilization - -Verify resource usage against allocatable capacity: - -```bash -echo "Checking Node Resource Utilization..." 
- -for node in $NODES; do - echo " Checking resources on node: $node" - - # Get allocatable and capacity - CPU_CAPACITY=$($CLI get node "$node" -o json | jq -r '.status.capacity.cpu') - CPU_ALLOCATABLE=$($CLI get node "$node" -o json | jq -r '.status.allocatable.cpu') - MEMORY_CAPACITY=$($CLI get node "$node" -o json | jq -r '.status.capacity.memory') - MEMORY_ALLOCATABLE=$($CLI get node "$node" -o json | jq -r '.status.allocatable.memory') - PODS_CAPACITY=$($CLI get node "$node" -o json | jq -r '.status.capacity.pods') - - # Get current pod count - POD_COUNT=$($CLI get pods --all-namespaces --field-selector spec.nodeName="$node" --no-headers 2>/dev/null | wc -l) - - echo " CPU: $CPU_ALLOCATABLE/$CPU_CAPACITY allocatable" - echo " Memory: $MEMORY_ALLOCATABLE/$MEMORY_CAPACITY allocatable" - echo " Pods: $POD_COUNT/$PODS_CAPACITY" - - # Check if pod count is near capacity - if [ "$POD_COUNT" -ge "$((PODS_CAPACITY * 90 / 100))" ]; then - WARNING_ISSUES=$((WARNING_ISSUES + 1)) - echo "⚠️ WARNING: Node $node is running $POD_COUNT pods (near capacity of $PODS_CAPACITY)" - fi - - # Check disk usage - if [ "$VERBOSE" = true ]; then - echo " Disk usage:" - $CLI debug node/"$node" --image=registry.access.redhat.com/ubi9/ubi-minimal -- chroot /host df -h / /var /var/lib/kubelet /var/lib/containers 2>/dev/null | grep -v "Filesystem" - fi - - # Check ephemeral storage pressure - EPHEMERAL_STORAGE=$($CLI get node "$node" -o json | jq -r '.status.allocatable."ephemeral-storage" // "unknown"') - if [ "$EPHEMERAL_STORAGE" != "unknown" ]; then - echo " Ephemeral Storage: $EPHEMERAL_STORAGE allocatable" - fi -done -``` - -### 7. Check System Daemons and Services - -Validate critical system services: - -```bash -echo "Checking System Daemons..." 
- -CRITICAL_SERVICES="kubelet crio" - -for node in $NODES; do - echo " Checking system services on node: $node" - - for service in $CRITICAL_SERVICES; do - SERVICE_STATUS=$($CLI debug node/"$node" --image=registry.access.redhat.com/ubi9/ubi-minimal -- chroot /host systemctl is-active "$service" 2>/dev/null || echo "failed") - - if [ "$SERVICE_STATUS" != "active" ]; then - CRITICAL_ISSUES=$((CRITICAL_ISSUES + 1)) - echo "❌ CRITICAL: Service $service is not active on node $node" - fi - done - - # Check for failed systemd units - FAILED_UNITS=$($CLI debug node/"$node" --image=registry.access.redhat.com/ubi9/ubi-minimal -- chroot /host systemctl list-units --state=failed --no-pager --no-legend 2>/dev/null | wc -l) - - if [ "$FAILED_UNITS" -gt 0 ]; then - WARNING_ISSUES=$((WARNING_ISSUES + 1)) - echo "⚠️ WARNING: Node $node has $FAILED_UNITS failed systemd unit(s)" - - if [ "$VERBOSE" = true ]; then - $CLI debug node/"$node" --image=registry.access.redhat.com/ubi9/ubi-minimal -- chroot /host systemctl list-units --state=failed --no-pager 2>/dev/null - fi - fi -done -``` - -### 8. Check Kernel Parameters and System Tunables - -Verify important kernel parameters for Kubernetes: - -```bash -echo "Checking Kernel Parameters..." - -for node in $NODES; do - if [ "$VERBOSE" = true ]; then - echo " Checking kernel parameters on node: $node" - - # Check key sysctl parameters - echo " Key sysctl parameters:" - $CLI debug node/"$node" --image=registry.access.redhat.com/ubi9/ubi-minimal -- chroot /host sysctl -a 2>/dev/null | grep -E "(vm.overcommit_memory|vm.panic_on_oom|kernel.panic|kernel.panic_on_oops|net.ipv4.ip_forward)" || true - - # Check SELinux status - SELINUX_STATUS=$($CLI debug node/"$node" --image=registry.access.redhat.com/ubi9/ubi-minimal -- chroot /host getenforce 2>/dev/null || echo "unknown") - echo " SELinux: $SELINUX_STATUS" - fi -done -``` - -### 9. 
Check Recent Node Events - -Look for recent warning/error events: - -```bash -echo "Checking Recent Node Events..." - -for node in $NODES; do - # Get recent warning events for the node - NODE_EVENTS=$($CLI get events --all-namespaces --field-selector involvedObject.name="$node",involvedObject.kind=Node,type=Warning --sort-by='.lastTimestamp' 2>/dev/null | tail -10) - - if [ -n "$NODE_EVENTS" ]; then - WARNING_ISSUES=$((WARNING_ISSUES + 1)) - echo "⚠️ Recent Warning Events for Node $node:" - echo "$NODE_EVENTS" - fi - - # Get recent pod events on this node - POD_EVENTS=$($CLI get events --all-namespaces --field-selector spec.nodeName="$node",type=Warning --sort-by='.lastTimestamp' 2>/dev/null | tail -10) - - if [ -n "$POD_EVENTS" ] && [ "$VERBOSE" = true ]; then - echo "ℹ️ Recent Pod Warning Events on Node $node:" - echo "$POD_EVENTS" - fi -done -``` - -### 10. Check Pod Status on Nodes - -Verify pods running on each node: - -```bash -echo "Checking Pods on Nodes..." - -for node in $NODES; do - echo " Checking pods on node: $node" - - # Count pods by phase - RUNNING_PODS=$($CLI get pods --all-namespaces --field-selector spec.nodeName="$node",status.phase=Running --no-headers 2>/dev/null | wc -l) - PENDING_PODS=$($CLI get pods --all-namespaces --field-selector spec.nodeName="$node",status.phase=Pending --no-headers 2>/dev/null | wc -l) - FAILED_PODS=$($CLI get pods --all-namespaces --field-selector spec.nodeName="$node",status.phase=Failed --no-headers 2>/dev/null | wc -l) - - echo " Running: $RUNNING_PODS, Pending: $PENDING_PODS, Failed: $FAILED_PODS" - - if [ "$FAILED_PODS" -gt 0 ]; then - WARNING_ISSUES=$((WARNING_ISSUES + 1)) - echo "⚠️ WARNING: Node $node has $FAILED_PODS failed pod(s)" - - if [ "$VERBOSE" = true ]; then - $CLI get pods --all-namespaces --field-selector spec.nodeName="$node",status.phase=Failed --no-headers - fi - fi - - # Check for pods with high restart counts - HIGH_RESTART_PODS=$($CLI get pods --all-namespaces --field-selector 
spec.nodeName="$node" -o json 2>/dev/null | jq -r '.items[] | select(.status.containerStatuses[]? | .restartCount > 5) | "\(.metadata.namespace)/\(.metadata.name) [Restarts: \(.status.containerStatuses[0].restartCount)]"') - - if [ -n "$HIGH_RESTART_PODS" ]; then - WARNING_ISSUES=$((WARNING_ISSUES + $(echo "$HIGH_RESTART_PODS" | wc -l))) - echo "⚠️ WARNING: Pods with high restart count (>5) on node $node:" - echo "$HIGH_RESTART_PODS" | while read pod; do - echo " - $pod" - done - fi -done -``` - -### 11. Check Node Labels and Roles - -Verify node labels and role assignments: - -```bash -echo "Checking Node Labels and Roles..." - -for node in $NODES; do - if [ "$VERBOSE" = true ]; then - echo " Node: $node" - - # Get node roles - ROLES=$($CLI get node "$node" -o json | jq -r '.metadata.labels | to_entries[] | select(.key | startswith("node-role.kubernetes.io/")) | .key' | sed 's/node-role.kubernetes.io\///' | tr '\n' ',' | sed 's/,$//') - echo " Roles: ${ROLES:-none}" - - # Check for custom labels - CUSTOM_LABELS=$($CLI get node "$node" -o json | jq -r '.metadata.labels | to_entries[] | select(.key | startswith("node-role.kubernetes.io/") | not) | "\(.key)=\(.value)"' | head -5) - if [ -n "$CUSTOM_LABELS" ]; then - echo " Custom labels:" - echo "$CUSTOM_LABELS" | while read label; do - echo " - $label" - done - fi - fi -done -``` - -### 12. 
Generate Summary Report - -Create a summary of findings: - -```bash -echo "" -echo "===============================================" -echo "Node Health Check Summary" -echo "===============================================" -echo "Check Time: $(date)" -echo "Nodes Checked: $(echo $NODES | wc -w)" -echo "" -echo "Results:" -echo " Critical Issues: $CRITICAL_ISSUES" -echo " Warnings: $WARNING_ISSUES" -echo "" - -if [ $CRITICAL_ISSUES -eq 0 ] && [ $WARNING_ISSUES -eq 0 ]; then - echo "✅ OVERALL STATUS: HEALTHY - No issues detected" - exit 0 -elif [ $CRITICAL_ISSUES -gt 0 ]; then - echo "❌ OVERALL STATUS: CRITICAL - Immediate attention required" - exit 1 -else - echo "⚠️ OVERALL STATUS: WARNING - Monitoring recommended" - exit 0 -fi -``` - -### 13. Optional: Export to JSON Format - -If `--output-format json` is specified, export findings as JSON: - -```json -{ - "cluster": { - "checkTime": "2025-11-27T12:00:00Z", - "nodesChecked": 3 - }, - "summary": { - "criticalIssues": 0, - "warnings": 2, - "overallStatus": "WARNING" - }, - "nodes": [ - { - "name": "worker-0", - "status": { - "ready": true, - "memoryPressure": false, - "diskPressure": false, - "pidPressure": false, - "networkUnavailable": false - }, - "kubelet": { - "version": "v1.28.5", - "status": "active", - "restartCount": 0 - }, - "crio": { - "version": "1.28.2", - "status": "active", - "errorCount": 0 - }, - "resources": { - "cpu": { - "capacity": "4", - "allocatable": "3800m" - }, - "memory": { - "capacity": "16Gi", - "allocatable": "15Gi" - }, - "pods": { - "running": 25, - "capacity": 110 - } - }, - "issues": [] - } - ] -} -``` - -## Examples - -### Example 1: Check all nodes in the cluster -```bash -/node:cluster-node-health-check -``` - -Output, for healthy cluster: -```text -Node Health Check Summary -================================================================================ - -OVERALL STATUS: ✅ HEALTHY - -NODE STATUS: -✅ worker-0: Ready (Kubelet v1.28.5, CRI-O 1.28.2) - - CPU: 3800m/4 allocatable, 
Memory: 15Gi/16Gi - - Pods: 25/110 -✅ worker-1: Ready (Kubelet v1.28.5, CRI-O 1.28.2) - - CPU: 3800m/4 allocatable, Memory: 15Gi/16Gi - - Pods: 28/110 -✅ worker-2: Ready (Kubelet v1.28.5, CRI-O 1.28.2) - - CPU: 3800m/4 allocatable, Memory: 15Gi/16Gi - - Pods: 22/110 - -No critical issues found. All nodes are operating normally. - -DIAGNOSTIC COMMANDS: -Run these commands for detailed information: - -kubectl get nodes -o wide -kubectl describe node -kubectl top nodes -``` - -Output, with issues: -```text -Node Health Check Summary -================================================================================ - -OVERALL STATUS: ⚠️ WARNING - -NODE STATUS: -✅ worker-0: Ready -⚠️ worker-1: Ready (with warnings) -❌ worker-2: Not Ready - -ISSUES FOUND: - -[CRITICAL] Node Not Ready -- Node: worker-2 -- Condition: Ready=False -- Reason: KubeletNotReady -- Message: container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: cni plugin not initialized -- Impact: Node cannot schedule new pods -- Recommended Action: - kubectl describe node worker-2 - kubectl debug node/worker-2 -- chroot /host journalctl -u kubelet -n 50 - -[WARNING] High Pod Restart Count -- Node: worker-1 -- Pod: openshift-monitoring/prometheus-k8s-0 [Restarts: 8] -- Impact: May indicate application or resource issues -- Recommended Action: - kubectl logs -n openshift-monitoring prometheus-k8s-0 --previous - kubectl describe pod -n openshift-monitoring prometheus-k8s-0 - -[WARNING] Disk Pressure -- Node: worker-1 -- Condition: DiskPressure=True -- Message: ephemeral-storage usage(92%) exceeds the threshold(85%) -- Impact: Pods may be evicted to free disk space -- Recommended Action: - kubectl debug node/worker-1 -- chroot /host df -h - kubectl get pods -A --field-selector spec.nodeName=worker-1 -o json | jq '.items[] | {name:.metadata.name, ephemeralStorage:.spec.containers[].resources.requests."ephemeral-storage"}' - -DIAGNOSTIC COMMANDS: - 
-1. Check node details: - kubectl get nodes -o wide - kubectl describe node worker-2 - -2. Check kubelet logs: - kubectl debug node/worker-2 -- chroot /host journalctl -u kubelet -n 100 - -3. Check CRI-O logs: - kubectl debug node/worker-2 -- chroot /host journalctl -u crio -n 100 - -4. Check node resource usage: - kubectl top nodes -``` - -### Example 2: Check a specific node -```bash -/node:cluster-node-health-check --node worker-1 -``` - -### Example 3: Verbose health check with detailed output -```bash -/node:cluster-node-health-check --verbose -``` - -### Example 4: JSON output for automation -```bash -/node:cluster-node-health-check --output-format json -``` - -## Return Value - -The command returns a structured health report containing: - -- **OVERALL STATUS**: Health summary (Healthy ✅ / Warning ⚠️ / Critical ❌) -- **NODE STATUS**: Status of each checked node with visual indicators -- **ISSUES FOUND**: Detailed list of problems with: - - Severity level (Critical/Warning/Info) - - Node location - - Impact assessment - - Recommended actions -- **DIAGNOSTIC COMMANDS**: kubectl commands for further investigation - -## Common Issues and Remediation - -### Node Not Ready - -**Symptoms**: Node showing Ready=False - -**Investigation**: -```bash -kubectl describe node -kubectl debug node/ -- chroot /host journalctl -u kubelet -n 100 -``` - -**Remediation**: Common causes include: -- Kubelet service failure -- Container runtime (CRI-O) issues -- Network plugin not initialized -- Certificate expiration - -### Kubelet Service Failures - -**Symptoms**: Kubelet service not active or frequently restarting - -**Investigation**: -```bash -kubectl debug node/ -- chroot /host systemctl status kubelet -kubectl debug node/ -- chroot /host journalctl -u kubelet --since "1 hour ago" -``` - -**Remediation**: Check kubelet logs for configuration errors, certificate issues, or API server connectivity problems - -### CRI-O Runtime Issues - -**Symptoms**: Pods failing to start, 
container runtime errors - -**Investigation**: -```bash -kubectl debug node/ -- chroot /host systemctl status crio -kubectl debug node/ -- chroot /host journalctl -u crio -p err --since "1 hour ago" -kubectl debug node/ -- chroot /host crictl ps -a -``` - -**Remediation**: Check for CRI-O configuration issues, storage problems, or network misconfigurations - -### Memory/Disk Pressure - -**Symptoms**: MemoryPressure=True or DiskPressure=True - -**Investigation**: -```bash -kubectl describe node -kubectl debug node/ -- chroot /host df -h -kubectl debug node/ -- chroot /host free -h -kubectl top node -kubectl top pods --all-namespaces --field-selector spec.nodeName= -``` - -**Remediation**: -- Increase node resources -- Clean up unused images: `crictl rmi --prune` -- Evict or delete unnecessary pods -- Check for pod resource limits - -### Network Unavailable - -**Symptoms**: NetworkUnavailable=True - -**Investigation**: -```bash -kubectl describe node -kubectl get pods -n openshift-multus -o wide -kubectl get pods -n openshift-sdn -o wide # or openshift-ovn-kubernetes -``` - -**Remediation**: Check CNI plugin status, network operator logs, and node network configuration - -### Certificate Expiration - -**Symptoms**: Certificate warnings in kubelet logs - -**Investigation**: -```bash -kubectl debug node/ -- chroot /host openssl x509 -in /var/lib/kubelet/pki/kubelet-client-current.pem -noout -dates -``` - -**Remediation**: Rotate kubelet certificates (automatic in most cases, manual intervention may be needed for expired certs) - -## Security Considerations - -- **Read-only access**: This command primarily reads cluster state, but uses debug pods for node-level inspection -- **Debug pods**: Creates temporary debug pods with host access for system-level checks -- **Sensitive data**: Node logs and system information may contain sensitive data -- **RBAC requirements**: Ensure user has appropriate permissions for nodes, pods, and debug pod creation - -## Notes - -- This 
command requires appropriate RBAC permissions to view nodes, pods, and create debug pods -- Debug pods are automatically cleaned up after inspection -- Some checks require debug pod creation which may not work in all cluster configurations -- For OpenShift clusters, some checks leverage OpenShift-specific features -- The command provides diagnostic guidance but does not automatically remediate issues -- Regular health checks help catch node issues before they impact workloads -- Node-level issues can cascade to affect pod scheduling and cluster capacity diff --git a/plugins/olm/commands/approve.md b/plugins/olm/commands/approve.md deleted file mode 100644 index 3aa5437b0..000000000 --- a/plugins/olm/commands/approve.md +++ /dev/null @@ -1,305 +0,0 @@ ---- -description: Approve pending InstallPlans for operator installations and upgrades -argument-hint: [namespace] [--all] ---- - -## Name -olm:approve - -## Synopsis -``` -/olm:approve [namespace] [--all] -``` - -## Description -The `olm:approve` command approves pending InstallPlans for operators with manual approval mode. This is required for operators that have `installPlanApproval: Manual` in their Subscription to proceed with installation or upgrades. - -This command helps you: -- Approve operator installations that are waiting for manual approval -- Approve operator upgrades -- Review what will be installed/upgraded before approval -- Batch approve multiple pending InstallPlans - -## Implementation - -The command performs the following steps: - -1. **Parse Arguments**: - - `$1`: Operator name (required) - Name of the operator - - `$2`: Namespace (optional) - Namespace where operator is installed - - If not provided, searches for the operator across all namespaces - - `$3`: Flag (optional): - - `--all`: Approve all pending InstallPlans in the namespace - -2. 
**Prerequisites Check**: - - Verify `oc` CLI is installed: `which oc` - - Verify cluster access: `oc whoami` - - Check if user has sufficient privileges - -3. **Locate Operator**: - - If namespace provided, verify operator exists: - ```bash - oc get subscription {operator-name} -n {namespace} --ignore-not-found - ``` - - If no namespace provided, search across all namespaces: - ```bash - oc get subscription --all-namespaces -o json | jq -r '.items[] | select(.spec.name=="{operator-name}") | .metadata.namespace' - ``` - - If not found, display error with suggestions - -4. **Check Subscription Approval Mode**: - - Get Subscription approval mode: - ```bash - oc get subscription {operator-name} -n {namespace} -o jsonpath='{.spec.installPlanApproval}' - ``` - - If mode is "Automatic", display informational message: - ``` - ℹ️ Operator '{operator-name}' has automatic approval enabled. - InstallPlans are approved automatically and don't require manual intervention. - - Current Subscription approval mode: Automatic - - To switch to manual approval mode: - oc patch subscription {operator-name} -n {namespace} \ - --type merge --patch '{"spec":{"installPlanApproval":"Manual"}}' - ``` - - Exit if automatic (no approval needed) - -5. **Find Pending InstallPlans**: - - Get all InstallPlans for the operator: - ```bash - oc get installplan -n {namespace} -o json - ``` - - Filter for unapproved plans related to this operator: - ```bash - oc get installplan -n {namespace} -o json | \ - jq '.items[] | select(.spec.approved==false and .spec.clusterServiceVersionNames[] | contains("{operator-name}"))' - ``` - - If no pending InstallPlans found: - ``` - ✓ No pending InstallPlans found for operator '{operator-name}' - - The operator is up to date or already approved. - - To check operator status: /olm:status {operator-name} {namespace} - ``` - - Exit with success - -6. 
**Display InstallPlan Details**: - For each pending InstallPlan, display: - ``` - ⏸️ Pending InstallPlan Found - - InstallPlan: {installplan-name} - Namespace: {namespace} - Phase: {phase} - Approved: false - - ClusterServiceVersions to be installed/upgraded: - - {csv-name-1} ({version-1}) - - {csv-name-2} ({version-2}) - - Resources to be created/updated: - - CustomResourceDefinitions: {crd-count} - - ServiceAccounts: {sa-count} - - ClusterRoles: {role-count} - - Deployments: {deployment-count} - - [If upgrade:] - Current Version: {current-version} - Target Version: {target-version} - ``` - -7. **Request User Confirmation** (unless `--all` or `--force` flag): - - Display confirmation prompt: - ``` - Do you want to approve this InstallPlan? (yes/no) - ``` - - If user says no, skip this InstallPlan - - If user says yes, proceed to approval - -8. **Approve InstallPlan**: - - Patch the InstallPlan to approve it: - ```bash - oc patch installplan {installplan-name} -n {namespace} \ - --type merge --patch '{"spec":{"approved":true}}' - ``` - - Verify approval: - ```bash - oc get installplan {installplan-name} -n {namespace} -o jsonpath='{.spec.approved}' - ``` - - Display confirmation: - ``` - ✓ InstallPlan approved: {installplan-name} - ``` - - Reference: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-approving-operator-upgrades_olm-updating-operators - -9. **Monitor InstallPlan Execution** (optional): - - Watch InstallPlan phase change to "Complete": - ```bash - oc get installplan {installplan-name} -n {namespace} -w --timeout=120s - ``` - - Display progress: - ``` - 🔄 InstallPlan executing... - ⏳ Installing resources... - ``` - -10. 
**Verify Installation/Upgrade**: - - Wait for CSV to reach "Succeeded" phase: - ```bash - oc get csv -n {namespace} -o json | \ - jq -r '.items[] | select(.status.phase=="Succeeded") | .metadata.name' - ``` - - Display result: - ``` - ✓ Operator installation/upgrade complete - - CSV: {csv-name} - Version: {version} - Phase: Succeeded - - To check operator status: /olm:status {operator-name} {namespace} - ``` - -11. **Handle Multiple InstallPlans** (if `--all` flag): - - Process all pending InstallPlans for the operator - - Display summary: - ``` - ✓ Approved {count} InstallPlan(s) - - Approved: - - {installplan-1} - - {installplan-2} - - Monitoring installation progress... - ``` - -12. **Display Approval Summary**: - ``` - ✓ Approval Complete! - - Operator: {operator-name} - Namespace: {namespace} - Approved InstallPlans: {count} - - InstallPlan Status: - - {installplan-1}: Complete - - {installplan-2}: Installing... - - Monitor progress: watch oc get csv,installplan -n {namespace} - ``` - -## Return Value -- **Success**: InstallPlan(s) approved successfully -- **No Pending Plans**: No InstallPlans require approval -- **Automatic Mode**: Operator has automatic approval (no action needed) -- **Error**: Approval failed with specific error message -- **Format**: Structured output showing: - - Approved InstallPlan names - - Installation/upgrade status - - Next steps or related commands - -## Examples - -1. **Approve pending InstallPlan for an operator**: - ``` - /olm:approve openshift-cert-manager-operator - ``` - -2. **Approve with specific namespace**: - ``` - /olm:approve external-secrets-operator eso-operator - ``` - -3. **Approve all pending InstallPlans**: - ``` - /olm:approve openshift-cert-manager-operator cert-manager-operator --all - ``` - This approves all pending InstallPlans for the operator in the namespace. - -4. 
**Check and approve after upgrade command**: - ``` - /olm:upgrade openshift-cert-manager-operator --channel=tech-preview - # Wait for InstallPlan to be created - /olm:approve openshift-cert-manager-operator - ``` - -## Arguments -- **$1** (operator-name): Name of the operator (required) - - Example: "openshift-cert-manager-operator" - - Must match the operator's Subscription name -- **$2** (namespace): Namespace where operator is installed (optional) - - If not provided, searches all namespaces - - Example: "cert-manager-operator" -- **$3** (flag): Optional flag - - `--all`: Approve all pending InstallPlans for this operator - - Useful when multiple upgrades are pending - - Skips individual confirmation prompts - -## Notes - -- **Manual Approval Mode**: This command only works for operators with `installPlanApproval: Manual` in their Subscription -- **Automatic Operators**: Operators with automatic approval don't need this command -- **Review Before Approval**: Always review what will be installed/upgraded before approving -- **Multiple InstallPlans**: An operator may have multiple pending InstallPlans if updates accumulated while waiting for approval -- **InstallPlan Retention**: Approved InstallPlans remain in the namespace for audit purposes - -## Troubleshooting - -- **No pending InstallPlans**: - ```bash - # List all InstallPlans - oc get installplan -n {namespace} - - # Check if operator is in automatic mode - oc get subscription {operator-name} -n {namespace} -o jsonpath='{.spec.installPlanApproval}' - ``` - -- **InstallPlan not executing after approval**: - ```bash - # Check InstallPlan status - oc describe installplan {installplan-name} -n {namespace} - - # Check for errors - oc get events -n {namespace} --sort-by='.lastTimestamp' | grep InstallPlan - ``` - -- **CSV not reaching Succeeded phase**: - ```bash - # Check CSV status - oc describe csv -n {namespace} - - # Check operator deployment - oc get deployments -n {namespace} - - # Check operator logs - 
oc logs -n {namespace} deployment/{operator-deployment} - ``` - -- **Permission denied**: - ```bash - # Check if you can patch InstallPlans - oc auth can-i patch installplan -n {namespace} - ``` - -- **Multiple namespaces found**: - - Specify the namespace explicitly in the command: - ``` - /olm:approve {operator-name} {specific-namespace} - ``` - -## Related Commands - -- `/olm:status ` - Check if InstallPlans are pending approval -- `/olm:upgrade ` - Trigger upgrade and approve in one command -- `/olm:install ` - Install operator with approval mode -- `/olm:list` - List operators and their approval modes - -## Additional Resources - -- [Red Hat OpenShift: Approving Operator Upgrades](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-approving-operator-upgrades_olm-updating-operators) -- [Red Hat OpenShift: Updating Installed Operators](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-updating-operators) -- [Operator Lifecycle Manager Documentation](https://olm.operatorframework.io/) - - diff --git a/plugins/olm/commands/catalog.md b/plugins/olm/commands/catalog.md deleted file mode 100644 index cd43964ea..000000000 --- a/plugins/olm/commands/catalog.md +++ /dev/null @@ -1,433 +0,0 @@ ---- -description: Manage catalog sources for discovering and installing operators -argument-hint: [arguments] ---- - -## Name -olm:catalog - -## Synopsis -``` -/olm:catalog list -/olm:catalog add [--namespace=openshift-marketplace] -/olm:catalog remove [--namespace=openshift-marketplace] -/olm:catalog refresh [--namespace=openshift-marketplace] -/olm:catalog status [--namespace=openshift-marketplace] -``` - -## Description -The `olm:catalog` command manages catalog sources for operator discovery and installation. Catalog sources provide the list of operators available for installation in the cluster. 
- -This command helps you: -- List all available catalog sources and their health status -- Add custom or private catalog sources -- Remove catalog sources -- Refresh catalog sources to get latest operator updates - -## Implementation - -### Subcommand: list - -1. **Get All CatalogSources**: - ```bash - oc get catalogsource -n openshift-marketplace -o json - ``` - -2. **Parse CatalogSource Data**: - For each catalog, extract: - - Name: `.metadata.name` - - Display Name: `.spec.displayName` - - Publisher: `.spec.publisher` - - Source Type: `.spec.sourceType` (grpc, configmap, etc.) - - Image: `.spec.image` (for grpc type) - - Connection State: `.status.connectionState.lastObservedState` - - Last Updated: `.status.connectionState.lastUpdatedTime` - - Number of Operators: Count from PackageManifests with this catalog - -3. **Get Catalog Pod Status**: - ```bash - oc get pods -n openshift-marketplace -l olm.catalogSource={catalog-name} - ``` - -4. **Format Output**: - ``` - ═══════════════════════════════════════════════════════════ - CATALOG SOURCES - ═══════════════════════════════════════════════════════════ - - NAME STATUS OPERATORS LAST UPDATED SOURCE TYPE - redhat-operators READY 150 2h ago grpc - certified-operators READY 45 3h ago grpc - community-operators READY 200 1h ago grpc - redhat-marketplace READY 30 4h ago grpc - custom-catalog FAILED 0 - grpc - - ═══════════════════════════════════════════════════════════ - DETAILS - ═══════════════════════════════════════════════════════════ - - redhat-operators: - Display Name: Red Hat Operators - Publisher: Red Hat - Image: registry.redhat.io/redhat/redhat-operator-index:v4.20 - Pod: redhat-operators-abc123 (Running) - - custom-catalog (FAILED): - Display Name: Custom Catalog - Publisher: My Company - Image: registry.example.com/custom-catalog:latest - Pod: custom-catalog-xyz789 (CrashLoopBackOff) - Error: ImagePullBackOff - - To troubleshoot: - /olm:catalog status custom-catalog - ``` - -### Subcommand: add - -1. 
**Parse Arguments**: - - `name`: Catalog source name (required) - - `image`: Catalog image (required) - - `--namespace`: Target namespace (default: openshift-marketplace) - - `--display-name`: Display name (optional) - - `--publisher`: Publisher name (optional) - -2. **Validate Image**: - - Check if image format is valid - - Optionally test image accessibility (if possible) - -3. **Create CatalogSource Manifest**: - ```yaml - apiVersion: operators.coreos.com/v1alpha1 - kind: CatalogSource - metadata: - name: {name} - namespace: {namespace} - spec: - sourceType: grpc - image: {image} - displayName: {display-name} - publisher: {publisher} - updateStrategy: - registryPoll: - interval: 30m - ``` - -4. **Apply CatalogSource**: - ```bash - oc apply -f /tmp/catalogsource-{name}.yaml - ``` - -5. **Wait for CatalogSource to be Ready**: - ```bash - oc wait --for=condition=READY catalogsource/{name} -n {namespace} --timeout=300s - ``` - -6. **Verify Pod is Running**: - ```bash - oc get pods -n {namespace} -l olm.catalogSource={name} - ``` - -7. **Display Result**: - ``` - ✓ Catalog source added: {name} - - Name: {name} - Namespace: {namespace} - Image: {image} - Status: READY - Pod: {pod-name} (Running) - - To search operators: /olm:search --catalog {name} - ``` - -### Subcommand: remove - -1. **Parse Arguments**: - - `name`: Catalog source name (required) - - `--namespace`: Namespace (default: openshift-marketplace) - -2. **Check if CatalogSource Exists**: - ```bash - oc get catalogsource {name} -n {namespace} --ignore-not-found - ``` - -3. **Check for Operators Using This Catalog**: - ```bash - oc get subscription --all-namespaces -o json | \ - jq -r '.items[] | select(.spec.source=="{name}") | "\(.metadata.namespace)/\(.metadata.name)"' - ``` - -4. 
**Display Warning** (if operators found): - ``` - WARNING: The following operators are using this catalog: - - namespace-1/operator-1 - - namespace-2/operator-2 - - Removing this catalog will prevent these operators from receiving updates. - - Do you want to continue? (yes/no) - ``` - -5. **Delete CatalogSource**: - ```bash - oc delete catalogsource {name} -n {namespace} - ``` - -6. **Wait for Pod to be Deleted**: - ```bash - oc wait --for=delete pod -l olm.catalogSource={name} -n {namespace} --timeout=60s - ``` - -7. **Display Result**: - ``` - ✓ Catalog source removed: {name} - ``` - -### Subcommand: refresh - -1. **Parse Arguments**: - - `name`: Catalog source name (required) - - `--namespace`: Namespace (default: openshift-marketplace) - -2. **Get Current CatalogSource**: - ```bash - oc get catalogsource {name} -n {namespace} -o json - ``` - -3. **Trigger Refresh by Deleting Pod**: - ```bash - oc delete pod -n {namespace} -l olm.catalogSource={name} - ``` - - This forces OLM to recreate the pod and re-fetch catalog data - -4. **Wait for New Pod to be Ready**: - ```bash - oc wait --for=condition=Ready pod -l olm.catalogSource={name} -n {namespace} --timeout=300s - ``` - -5. **Verify Catalog is Updated**: - ```bash - oc get catalogsource {name} -n {namespace} -o json | \ - jq -r '.status.connectionState.lastUpdatedTime' - ``` - -6. **Display Result**: - ``` - ✓ Catalog source refreshed: {name} - - Last Updated: {timestamp} - Status: READY - Pod: {pod-name} (Running) - - New operators may now be available: /olm:search --catalog {name} - ``` - -### Subcommand: status - -1. **Parse Arguments**: - - `name`: Catalog source name (required) - - `--namespace`: Namespace (default: openshift-marketplace) - -2. **Get CatalogSource Details**: - ```bash - oc get catalogsource {name} -n {namespace} -o json - ``` - -3. **Get Pod Details**: - ```bash - oc get pods -n {namespace} -l olm.catalogSource={name} -o json - ``` - -4. 
**Get Recent Events**: - ```bash - oc get events -n {namespace} --field-selector involvedObject.name={name} --sort-by='.lastTimestamp' - ``` - -5. **Count Available Operators**: - ```bash - oc get packagemanifests -n openshift-marketplace -o json | \ - jq -r '.items[] | select(.status.catalogSource=="{name}") | .metadata.name' | wc -l - ``` - -6. **Verify Catalog Connectivity**: - - Check if catalog is serving content by verifying PackageManifest count > 0 - - If count is 0 but pod is Running, indicates connectivity or catalog index issues - - Review catalog pod logs for gRPC errors, image pull issues, or index corruption: - ```bash - oc logs -n {namespace} {catalog-pod-name} - ``` - -7. **Format Comprehensive Status Report**: - ``` - ═══════════════════════════════════════════════════════════ - CATALOG SOURCE STATUS: {name} - ═══════════════════════════════════════════════════════════ - - General Information: - Name: {name} - Namespace: {namespace} - Display Name: {display-name} - Publisher: {publisher} - Source Type: {source-type} - Image: {image} - - Connection Status: - State: {state} (READY | CONNECTING | CONNECTION_FAILED) - Last Updated: {timestamp} - Last Successful: {timestamp} - - Pod Status: - Name: {pod-name} - Status: {status} (Running | CrashLoopBackOff | ImagePullBackOff) - Ready: {ready-containers}/{total-containers} - Restarts: {restart-count} - Age: {age} - - Catalog Content: - Operators Available: {count} - - [If issues detected:] - ⚠️ Issues Detected: - - Pod in CrashLoopBackOff - - Last update: 24h ago (stale) - - Connection state: CONNECTION_FAILED - - Recent Events: - {timestamp} Warning: Failed to pull image - {timestamp} Warning: Back-off restarting failed container - - Troubleshooting Steps: - 1. Check pod logs: oc logs -n {namespace} {pod-name} - 2. Check image accessibility - 3. Refresh catalog: /olm:catalog refresh {name} - 4. 
Verify network connectivity (for disconnected environments) - - Related Commands: - - Refresh: /olm:catalog refresh {name} - - List operators: /olm:search --catalog {name} - ``` - -## Return Value -- **list**: Table of all catalog sources with status -- **add**: Confirmation of added catalog with details -- **remove**: Confirmation of removed catalog -- **refresh**: Confirmation of refresh with updated timestamp -- **status**: Comprehensive status report for specific catalog - -## Examples - -1. **List all catalog sources**: - ``` - /olm:catalog list - ``` - -2. **Add custom catalog**: - ``` - /olm:catalog add my-catalog registry.example.com/my-catalog:v1.0 - ``` - -3. **Add catalog with metadata**: - ``` - /olm:catalog add my-catalog registry.example.com/catalog:latest \ - --display-name="My Custom Catalog" \ - --publisher="My Company" - ``` - -4. **Remove catalog**: - ``` - /olm:catalog remove my-catalog - ``` - -5. **Refresh catalog to get latest operators**: - ``` - /olm:catalog refresh redhat-operators - ``` - -6. **Check catalog health**: - ``` - /olm:catalog status custom-catalog - ``` - -7. **Add catalog for disconnected environment**: - ``` - /olm:catalog add disconnected-operators \ - mirror-registry.local:5000/olm/redhat-operators:v4.20 \ - --namespace=openshift-marketplace - ``` - -## Arguments - -### list -No arguments required. 
- -### add -- **name** (required): Name for the catalog source -- **image** (required): Container image containing the catalog -- **--namespace**: Target namespace (default: openshift-marketplace) -- **--display-name**: Human-readable display name -- **--publisher**: Publisher/organization name - -### remove -- **name** (required): Name of the catalog source to remove -- **--namespace**: Namespace (default: openshift-marketplace) - -### refresh -- **name** (required): Name of the catalog source to refresh -- **--namespace**: Namespace (default: openshift-marketplace) - -### status -- **name** (required): Name of the catalog source to check -- **--namespace**: Namespace (default: openshift-marketplace) - -## Troubleshooting - -- **Catalog pod failing**: - ```bash - # Check pod logs - oc logs -n openshift-marketplace {catalog-pod-name} - - # Check image pull issues - oc describe pod -n openshift-marketplace {catalog-pod-name} - ``` - -- **No operators showing up**: - ```bash - # Verify catalog is ready - /olm:catalog status {catalog-name} - - # Check PackageManifests - oc get packagemanifests -n openshift-marketplace - ``` - -- **Image pull errors (disconnected environment)**: - - Verify image registry is accessible - - Check pull secrets are configured - - Ensure image has been mirrored correctly - -- **Stale catalog data**: - ```bash - # Force refresh - /olm:catalog refresh {catalog-name} - ``` - -- **Connection failures**: - ```bash - # Check catalog source definition - oc get catalogsource {catalog-name} -n openshift-marketplace -o yaml - - # Run cluster diagnostics - /olm:diagnose --cluster - ``` - -## Related Commands - -- `/olm:search` - Search for operators in catalogs -- `/olm:install` - Install operators from catalogs -- `/olm:diagnose` - Diagnose catalog health issues - -## Additional Resources -- [Building Catalog Images with opm](https://olm.operatorframework.io/docs/tasks/creating-catalog-from-index/) -- [Operator Lifecycle Manager 
Documentation](https://olm.operatorframework.io/) - - diff --git a/plugins/olm/commands/install.md b/plugins/olm/commands/install.md deleted file mode 100644 index ccc0bcfc2..000000000 --- a/plugins/olm/commands/install.md +++ /dev/null @@ -1,272 +0,0 @@ ---- -description: Install a day-2 operator using Operator Lifecycle Manager -argument-hint: [namespace] [channel] [source] [--approval=Automatic|Manual] ---- - -## Name -olm:install - -## Synopsis -``` -/olm:install [namespace] [channel] [source] [--approval=Automatic|Manual] -``` - -## Description -The `olm:install` command installs a day-2 operator in an OpenShift cluster using Operator Lifecycle Manager (OLM). It automates the creation of the required namespace, OperatorGroup, and Subscription resources needed to install an operator. - -This command handles the complete operator installation workflow: -- Creates or verifies the target namespace exists -- Creates an OperatorGroup if needed -- Creates a Subscription to install the operator -- Verifies the installation by checking the operator's CSV (ClusterServiceVersion) status -- Provides detailed feedback on the installation progress - -The command is designed to work with operators from the OperatorHub catalog, including Red Hat certified operators, community operators, and custom catalog sources. - -## Implementation - -The command performs the following steps: - -1. **Parse Arguments**: - - `$1`: Operator name (required) - The name of the operator to install (e.g., "openshift-cert-manager-operator") - - `$2`: Namespace (optional) - Target namespace for the operator. If not provided, defaults to `{operator-name}-operator` (e.g., "cert-manager-operator") - - `$3`: Channel (optional) - Subscription channel. If not provided, discovers the default channel from the operator's PackageManifest - - `$4`: Source (optional) - CatalogSource name. 
Defaults to "redhat-operators" for Red Hat operators - - `$5+`: Flags (optional): - - `--approval=Automatic|Manual`: InstallPlan approval mode (default: Automatic) - - Automatic: Operator upgrades are automatically installed - - Manual: Operator upgrades require manual approval via `/olm:approve` or `oc patch` - -2. **Prerequisites Check**: - - Verify `oc` CLI is installed: `which oc` - - Verify cluster access: `oc whoami` - - Check if user has cluster-admin or sufficient privileges - - If not installed or not authenticated, provide clear instructions - -3. **Discover Operator Metadata** (if channel or source not provided): - - Search for the operator in available catalogs: - ```bash - oc get packagemanifests -n openshift-marketplace | grep {operator-name} - ``` - - Get the PackageManifest details: - ```bash - oc get packagemanifest {operator-name} -n openshift-marketplace -o json - ``` - - Extract: - - Default channel: `.status.defaultChannel` - - CatalogSource: `.status.catalogSource` - - CatalogSourceNamespace: `.status.catalogSourceNamespace` - - If operator not found, provide error with list of available operators - -4. **Create Namespace**: - - Check if namespace exists: `oc get namespace {namespace} --ignore-not-found` - - If not exists, create it: - ```bash - oc create namespace {namespace} - ``` - - If exists, inform user and continue - -5. **Create OperatorGroup**: - - Check if OperatorGroup exists in the namespace: - ```bash - oc get operatorgroup -n {namespace} --ignore-not-found - ``` - - If no OperatorGroup exists, create one: - ```yaml - apiVersion: operators.coreos.com/v1 - kind: OperatorGroup - metadata: - name: {namespace}-operatorgroup - namespace: {namespace} - spec: - targetNamespaces: - - {namespace} - ``` - - Save to temporary file and apply: - ```bash - oc apply -f /tmp/operatorgroup-{operator-name}.yaml - ``` - - If OperatorGroup already exists, inform user and continue - -6. 
**Create Subscription**: - - Parse approval mode from flags (default: Automatic) - - Create Subscription manifest: - ```yaml - apiVersion: operators.coreos.com/v1alpha1 - kind: Subscription - metadata: - name: {operator-name} - namespace: {namespace} - spec: - channel: {channel} - name: {operator-name} - source: {source} - sourceNamespace: openshift-marketplace - installPlanApproval: {Automatic|Manual} - ``` - - Save to temporary file and apply: - ```bash - oc apply -f /tmp/subscription-{operator-name}.yaml - ``` - - Display the created subscription details - - If approval mode is Manual, display informational message: - ``` - ℹ️ InstallPlan approval set to Manual - You will need to manually approve InstallPlans for this operator. - Use: /olm:approve {operator-name} {namespace} - - Reference: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-approving-operator-upgrades_olm-updating-operators - ``` - -7. **Verify Installation**: - - Wait for InstallPlan to be created: - ```bash - oc get installplan -n {namespace} -l operators.coreos.com/operator={operator-name} - ``` - - If approval mode is Manual, check if InstallPlan needs approval: - ```bash - oc get installplan -n {namespace} -o json | jq '.items[] | select(.spec.approved==false)' - ``` - - If Manual and not approved, display message: - ``` - ⏸️ InstallPlan created but requires manual approval - - InstallPlan: {installplan-name} - To approve: /olm:approve {operator-name} {namespace} - Or manually: oc patch installplan {installplan-name} -n {namespace} \ - --type merge --patch '{"spec":{"approved":true}}' - - Waiting for approval... - ``` - - Wait for CSV to be created and reach "Succeeded" phase: - ```bash - oc get csv -n {namespace} -w - ``` - - Use a timeout of 5 minutes for the installation to complete (10 minutes if Manual approval) - - Poll every 10 seconds to check CSV status - - Display progress updates to the user - -8. 
**Display Results**: - - Show the installed operator's CSV name and version - - Show the operator deployment status: - ```bash - oc get deployments -n {namespace} - ``` - - List any pods created by the operator: - ```bash - oc get pods -n {namespace} - ``` - - Display success message with next steps or usage instructions - -9. **Cleanup Temporary Files**: - - Remove temporary YAML files created during installation: - ```bash - rm -f /tmp/operatorgroup-{operator-name}.yaml /tmp/subscription-{operator-name}.yaml - ``` - -## Return Value -- **Success**: Operator installed successfully with details about the CSV, deployments, and pods -- **Error**: Installation failed with specific error message and troubleshooting suggestions -- **Format**: Structured output showing: - - Namespace created/used - - OperatorGroup status - - Subscription created - - CSV status and version - - Deployment and pod status - -## Examples - -1. **Install cert-manager-operator with defaults**: - ``` - /olm:install openshift-cert-manager-operator - ``` - This will: - - Create namespace `cert-manager-operator` - - Discover default channel from PackageManifest - - Use `redhat-operators` catalog source - - Install the operator - -2. **Install cert-manager-operator with custom namespace**: - ``` - /olm:install openshift-cert-manager-operator my-cert-manager - ``` - This will install the operator in the `my-cert-manager` namespace. - -3. **Install with specific channel**: - ``` - /olm:install openshift-cert-manager-operator cert-manager-operator stable-v1 - ``` - This will install from the `stable-v1` channel. - -4. **Install from community catalog**: - ``` - /olm:install prometheus community-operators stable community-operators - ``` - This will install Prometheus from the community-operators catalog. - -5. **Install Red Hat Advanced Cluster Security**: - ``` - /olm:install rhacs-operator rhacs-operator stable - ``` - -6. 
**Install with manual approval mode**: - ``` - /olm:install openshift-cert-manager-operator cert-manager-operator stable-v1 redhat-operators --approval=Manual - ``` - This will install the operator but require manual approval for all upgrades. - -7. **Install with all parameters specified**: - ``` - /olm:install external-secrets-operator eso-operator stable-v0.10 redhat-operators --approval=Automatic - ``` - -## Arguments -- **$1** (operator-name): The name of the operator to install (required) - - Example: "openshift-cert-manager-operator" - - Must match the name in the operator's PackageManifest -- **$2** (namespace): Target namespace for the operator installation (optional) - - Default: `{operator-name}` (operator name without "openshift-" prefix if present) - - Example: "cert-manager-operator" -- **$3** (channel): Subscription channel (optional) - - Default: Auto-discovered from PackageManifest's default channel - - Example: "stable-v1", "tech-preview", "stable" -- **$4** (source): CatalogSource name (optional) - - Default: "redhat-operators" - - Other options: "certified-operators", "community-operators", "redhat-marketplace" -- **$5+** (flags): Optional flags - - `--approval=Automatic|Manual`: InstallPlan approval mode - - **Automatic** (default): Operator upgrades are automatically installed without user intervention - - **Manual**: Operator upgrades require explicit approval. 
Useful for: - - Production environments requiring change control - - Testing upgrades before applying - - Preventing unexpected operator updates - - Reference: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-approving-operator-upgrades_olm-updating-operators - -## Notes - -- **Automatic Channel Discovery**: If no channel is specified, the command automatically discovers and uses the operator's default channel from its PackageManifest -- **Namespace Convention**: By default, operators are installed in a namespace named after the operator with any "openshift-" prefix removed (e.g., `openshift-cert-manager-operator` is installed into `cert-manager-operator`) -- **OperatorGroup Scope**: The created OperatorGroup targets only the installation namespace for better isolation -- **InstallPlan Approval**: Set to "Automatic" by default for seamless installation. Can be changed to "Manual" using `--approval=Manual` flag -- **Manual Approval Mode**: When using `--approval=Manual`: - - Initial installation may require manual approval of the InstallPlan - - All future upgrades will require explicit approval via `/olm:approve` command - - Provides better control over operator updates in production environments -- **Verification Timeout**: The command waits up to 5 minutes for the operator to install successfully (10 minutes for manual approval mode) -- **Cleanup**: Temporary YAML files are automatically removed after installation - -## Troubleshooting - -- **Operator not found**: Run `oc get packagemanifests -n openshift-marketplace` to see available operators -- **Permission denied**: Ensure you have cluster-admin privileges or the necessary RBAC permissions -- **Installation timeout**: Check the InstallPlan and CSV status manually: - ```bash - oc get installplan -n {namespace} - oc get csv -n {namespace} - oc describe csv -n {namespace} - ``` -- **Operator pod not starting**: Check pod logs: - ```bash - oc logs -n {namespace} deployment/{operator-deployment} - ``` - diff --git a/plugins/olm/commands/opm.md 
b/plugins/olm/commands/opm.md deleted file mode 100644 index bd30fe0de..000000000 --- a/plugins/olm/commands/opm.md +++ /dev/null @@ -1,359 +0,0 @@ ---- -description: Execute opm (Operator Package Manager) commands for building and managing operator catalogs -argument-hint: [arguments...] ---- - -## Name -olm:opm - -## Synopsis -```bash -/olm:opm build-index-image [--cacheless] [--arch=] [--base-image=] [--builder-image=] -/olm:opm build-semver-index-image [--cacheless] [--arch=] [--base-image=] [--builder-image=] -/olm:opm generate-semver-template [--output=] [--major=true|false] [--minor=true|false] -/olm:opm list packages -/olm:opm list channels [package-name] -/olm:opm list bundles [package-name] -``` - -## Description -The `olm:opm` command provides a unified interface to `opm` (Operator Package Manager) operations for building and managing operator catalog indexes. It supports building catalog indexes, generating semver templates, and querying catalog contents. - -## Arguments -- `$1`: **action** - The action to perform: - - `build-index-image`: Build an index from an existing catalog directory - - `build-semver-index-image`: Build an index from a semver template - - `generate-semver-template`: Generate a semver template file - - `list`: List catalog contents (requires second argument: `packages`, `channels`, or `bundles`) -- `$2+`: Additional arguments specific to each action (see Actions section below) - -## Actions - -### build-index-image -Build an operator catalog index image from an existing catalog directory. 
- -**Synopsis:** -```bash -/olm:opm build-index-image [--cacheless] [--arch=] [--base-image=] [--builder-image=] -``` - -**Arguments:** -- `$2`: **catalog-path** - Path to the catalog directory containing the index configuration -- `$3`: **index-image-tag** - Full image tag for the resulting index image (e.g., `quay.io/myorg/mycatalog:v1.0.0`) -- `--cacheless`: Optional flag to build a cacheless image (uses `scratch` as base image; `--base-image` and `--builder-image` are ignored when this is set) -- `--arch=`: Optional architecture specification (default: `multi` for multi-arch build; can specify single arch like `amd64`, `arm64`, `ppc64le`, `s390x`) -- `--base-image=`: Optional base image for the index (default: `quay.io/operator-framework/opm:latest`; ignored if `--cacheless` is set) -- `--builder-image=`: Optional builder image (default: `quay.io/operator-framework/opm:latest`; ignored if `--cacheless` is set) - -**Examples:** -```bash -/olm:opm build-index-image catalog quay.io/myorg/mycatalog:v1.0.0 -/olm:opm build-index-image catalog quay.io/myorg/mycatalog:v1.0.0 --cacheless -/olm:opm build-index-image catalog quay.io/myorg/mycatalog:v1.0.0 --arch=amd64 -``` - -### build-semver-index-image -Build a multi-architecture operator catalog index image using the semver template format. 
- -**Synopsis:** -```bash -/olm:opm build-semver-index-image [--cacheless] [--arch=] [--base-image=] [--builder-image=] -``` - -**Arguments:** -- `$2`: **semver-template-file** - Path to the semver template configuration file (e.g., `catalog-config.yaml`) -- `$3`: **index-image-tag** - Full image tag for the resulting index image (e.g., `quay.io/myorg/mycatalog:v1.0.0`) -- `--cacheless`: Optional flag to build a cacheless image (uses `scratch` as base image; `--base-image` and `--builder-image` are ignored when this is set) -- `--arch=`: Optional architecture specification (default: `multi` for multi-arch build; can specify single arch like `amd64`, `arm64`, `ppc64le`, `s390x`) -- `--base-image=`: Optional base image for the index (default: `quay.io/operator-framework/opm:latest`; ignored if `--cacheless` is set) -- `--builder-image=`: Optional builder image (default: `quay.io/operator-framework/opm:latest`; ignored if `--cacheless` is set) - -**Examples:** -```bash -/olm:opm build-semver-index-image catalog-config.yaml quay.io/myorg/mycatalog:v1.0.0 -/olm:opm build-semver-index-image catalog-config.yaml quay.io/myorg/mycatalog:v1.0.0 --cacheless -/olm:opm build-semver-index-image catalog-config.yaml quay.io/myorg/mycatalog:v1.0.0 --arch=amd64 -/olm:opm build-semver-index-image catalog-config.yaml quay.io/myorg/mycatalog:v1.0.0 --arch=multi -``` - -### generate-semver-template -Generate a semver template configuration file for building operator catalogs. 
- -**Synopsis:** -```bash -/olm:opm generate-semver-template [--output=] [--major=true|false] [--minor=true|false] -``` - -**Arguments:** -- `$2`: **bundle-list** - Comma-separated list of bundle image references (e.g., `quay.io/org/bundle:v1.0.0,quay.io/org/bundle:v1.0.1`) -- `--output=`: Optional output file path (default: `catalog-semver-config.yaml` in current directory) -- `--major=true|false`: Optional flag to generate major version channels (default: `true`) -- `--minor=true|false`: Optional flag to generate minor version channels (default: `false`) - -**Examples:** -```bash -/olm:opm generate-semver-template quay.io/org/bundle:v1.0.0,quay.io/org/bundle:v1.0.1 -/olm:opm generate-semver-template quay.io/org/bundle:v1.0.0,quay.io/org/bundle:v1.0.1 --output=my-catalog.yaml -/olm:opm generate-semver-template quay.io/org/bundle:v1.0.0,quay.io/org/bundle:v1.1.0 --minor=true -``` - -### list packages -List all operator packages available in a catalog index. - -**Synopsis:** -```bash -/olm:opm list packages -``` - -**Arguments:** -- `$1`: **list** - Must be "list" -- `$2`: **packages** - Must be "packages" -- `$3`: **index-ref** - Catalog index reference, either: - - Image tag: `quay.io/myorg/mycatalog:v1.0.0` - - Directory path: `./catalog` or `/path/to/catalog` - -**Examples:** -```bash -/olm:opm list packages quay.io/olmqe/nginx8518-index-test:v1 -/olm:opm list packages ./catalog -``` - -### list channels -List channels for operator packages in a catalog index. 
- -**Synopsis:** -```bash -/olm:opm list channels [package-name] -``` - -**Arguments:** -- `$1`: **list** - Must be "list" -- `$2`: **channels** - Must be "channels" -- `$3`: **index-ref** - Catalog index reference (image tag or directory path) -- `$4`: **package-name** (Optional) - Name of a specific package to list channels for - -**Examples:** -```bash -/olm:opm list channels quay.io/olmqe/nginx8518-index-test:v1 -/olm:opm list channels quay.io/olmqe/nginx8518-index-test:v1 nginx85187 -/olm:opm list channels ./catalog -``` - -### list bundles -List bundles for operator packages in a catalog index. - -**Synopsis:** -```bash -/olm:opm list bundles [package-name] -``` - -**Arguments:** -- `$1`: **list** - Must be "list" -- `$2`: **bundles** - Must be "bundles" -- `$3`: **index-ref** - Catalog index reference (image tag or directory path) -- `$4`: **package-name** (Optional) - Name of a specific package to list bundles for - -**Examples:** -```bash -/olm:opm list bundles quay.io/olmqe/nginx8518-index-test:v1 -/olm:opm list bundles quay.io/olmqe/nginx8518-index-test:v1 nginx85187 -/olm:opm list bundles ./catalog -``` - -## Implementation - -### Step 1: Parse Action -- Extract the action from `$1` -- Validate the action is one of: `build-index-image`, `build-semver-index-image`, `generate-semver-template`, `list` -- If invalid action, display error with available actions - -### Step 2: Check Prerequisites -Verify required tools are installed: -- Check for `opm`: `which opm` - - If not found, provide installation instructions: -- For build actions, also check for `podman`: `which podman` - - If not found, provide installation instructions based on user's platform - -### Step 3: Route to Action Handler -Based on the action, call the appropriate implementation: - -#### For `build-index-image`: -1. 
**Parse Arguments and Set Defaults** - - Extract catalog path from `$2` - - Extract index image tag from `$3` - - Parse optional flags: `--cacheless`, `--arch`, `--base-image`, `--builder-image` - - Set defaults: arch=`multi`, base-image=`quay.io/operator-framework/opm:latest`, builder-image=`quay.io/operator-framework/opm:latest` - -2. **Verify Catalog Directory** - - Check catalog directory exists: `test -d ` - -3. **Validate Catalog** - ```bash - opm validate - ``` - -4. **Generate Dockerfile** - - If cacheless: `opm generate dockerfile --base-image=scratch` - - If normal: `opm generate dockerfile -b -i ` - -5. **Determine Build Platform** - - If arch=`multi`: `linux/amd64,linux/arm64,linux/ppc64le,linux/s390x` - - Otherwise: `linux/` - -6. **Create Podman Manifest** - ```bash - podman manifest rm 2>/dev/null || true - podman manifest create - ``` - -7. **Build Image** - ```bash - podman build --platform --manifest . -f catalog.Dockerfile - ``` - -8. **Push Manifest** - ```bash - podman manifest push - ``` - -9. **List Bundles in Index** - ```bash - opm alpha list bundles - ``` - -10. **Display Success Message** - -#### For `build-semver-index-image`: -1. **Parse Arguments and Set Defaults** - - Extract semver template file from `$2` - - Extract index image tag from `$3` - - Parse optional flags: `--cacheless`, `--arch`, `--base-image`, `--builder-image` - - Set defaults: arch=`multi`, base-image=`quay.io/operator-framework/opm:latest`, builder-image=`quay.io/operator-framework/opm:latest` - -2. **Verify Template File** - - Check file exists: `test -f ` - -3. **Create Catalog and Render Template** - ```bash - mkdir -p catalog - opm alpha render-template semver -o yaml > catalog/index.yaml - ``` - -4. **Validate Catalog** - ```bash - opm validate catalog - ``` - -5. **Generate Dockerfile** - - If cacheless: `opm generate dockerfile catalog --base-image=scratch` - - If normal: `opm generate dockerfile catalog -b -i ` - -6. 
**Determine Build Platform** - - If arch=`multi`: `linux/amd64,linux/arm64,linux/ppc64le,linux/s390x` - - Otherwise: `linux/` - -7. **Create Podman Manifest** - ```bash - podman manifest rm 2>/dev/null || true - podman manifest create - ``` - -8. **Build Image** - ```bash - podman build --platform --manifest . -f catalog.Dockerfile - ``` - -9. **Push Manifest** - ```bash - podman manifest push - ``` - -10. **List Bundles in Index** - ```bash - opm alpha list bundles - ``` - -11. **Display Success Message** - -#### For `generate-semver-template`: -1. **Parse Arguments and Set Defaults** - - Extract bundle list from `$2` - - Parse optional flags: `--output`, `--major`, `--minor` - - Set defaults: output=`catalog-semver-config.yaml`, major=`true`, minor=`false` - -2. **Validate Bundle List** - - Split by commas - - Validate each bundle is a valid image reference - -3. **Generate YAML Content** - ```yaml - Schema: olm.semver - GenerateMajorChannels: - GenerateMinorChannels: - Candidate: - Bundles: - - Image: - - Image: - ``` - -4. **Write Template File** - - Check if file exists and confirm overwrite if needed - - Write YAML content - -5. **Validate Generated File** - - Read back and verify YAML is well-formed - -6. **Display Success Message** - - Show file path, bundles included, settings - - Suggest next step: `/olm:opm build-semver-index-image ` - -#### For `list`: -1. **Parse List Type** - - Extract list type from `$2` (must be `packages`, `channels`, or `bundles`) - - If invalid, display error with available types - -2. **Parse Index Reference and Optional Package** - - Extract index-ref from `$3` - - Extract optional package-name from `$4` (for channels and bundles) - -3. **Determine Reference Type** - - Check if directory: `test -d ` - -4. **Execute List Command** - - For packages: `opm alpha list packages ` - - For channels: `opm alpha list channels [package-name]` - - For bundles: `opm alpha list bundles [package-name]` - -5. 
**Display Results** - - Show the output with appropriate formatting - - Display count of items found - -## Return Value - -**Format**: Varies by action - -- **build-index-image / build-semver-index-image**: Success message with image tag, architectures, and bundle list -- **generate-semver-template**: Success message with file path and configuration details -- **list**: Table or list of catalog contents - -On failure, displays: -- Clear error message indicating which step/action failed -- Relevant tool output for debugging -- Suggestions for resolution - -## Notes - -- Ensure you are authenticated to container registries before building/pushing images (use `podman login`) -- For build operations, the `catalog.Dockerfile` is created in the current working directory -- Multi-architecture builds can be time-consuming -- Cacheless builds result in smaller images and use `scratch` as the base image -- When using `--cacheless`, the `--base-image` and `--builder-image` options are ignored (scratch is always used as base) -- Index references can be either image tags or local directory paths -- Bundle images must be accessible from where you build the catalog -- Image tags should include the full registry hostname (e.g., `quay.io/org/image:tag` not `quay/org/image:tag`) - -## Related Commands - -- `/olm:install` - Install an operator using OLM -- `/olm:catalog` - Manage catalog sources -- `/olm:debug` - Debug OLM issues diff --git a/plugins/olm/commands/uninstall.md b/plugins/olm/commands/uninstall.md deleted file mode 100644 index 36c3ec14c..000000000 --- a/plugins/olm/commands/uninstall.md +++ /dev/null @@ -1,392 +0,0 @@ ---- -description: Uninstall a day-2 operator and optionally remove its resources -argument-hint: [namespace] [--remove-crds] [--remove-namespace] ---- - -## Name -olm:uninstall - -## Synopsis -``` -/olm:uninstall [namespace] [--remove-crds] [--remove-namespace] -``` - -## Description -The `olm:uninstall` command uninstalls a day-2 operator from an 
OpenShift cluster by removing its Subscription, ClusterServiceVersion (CSV), and optionally its Custom Resource Definitions (CRDs) and namespace. - -This command provides a comprehensive uninstallation workflow: -- Removes the operator's Subscription -- Deletes the ClusterServiceVersion (CSV) -- Optionally removes operator-managed deployments -- Optionally deletes Custom Resource Definitions (CRDs) -- Optionally removes the operator's namespace -- Provides detailed feedback on each step - -The command is designed to safely clean up operators installed via OLM, with optional flags for thorough cleanup of all operator-related resources. - -## Implementation - -The command performs the following steps: - -1. **Parse Arguments**: - - `$1`: Operator name (required) - The name of the operator to uninstall - - `$2`: Namespace (optional) - The namespace where operator is installed. If not provided, defaults to `{operator-name}-operator` - - `$3+`: Flags (optional): - - `--remove-crds`: Remove Custom Resource Definitions after uninstalling - - `--remove-namespace`: Remove the operator's namespace after cleanup - - `--force`: Skip confirmation prompts - -2. **Prerequisites Check**: - - Verify `oc` CLI is installed: `which oc` - - Verify cluster access: `oc whoami` - - Check if user has cluster-admin or sufficient privileges - -3. **Verify Operator Installation**: - - Check if namespace exists: - ```bash - oc get namespace {namespace} --ignore-not-found - ``` - - Check if subscription exists: - ```bash - oc get subscription {operator-name} -n {namespace} --ignore-not-found - ``` - - If not found, display error: "Operator {operator-name} is not installed in namespace {namespace}" - - List what will be uninstalled - -4. 
**Display Uninstallation Plan**: - - Show operator details: - ```bash - oc get subscription {operator-name} -n {namespace} -o yaml - oc get csv -n {namespace} - ``` - - Display what will be removed: - - Subscription name and namespace - - CSV name and version - - Deployments (if any) - - CRDs (if `--remove-crds` flag is set) - - Namespace (if `--remove-namespace` flag is set) - -5. **Request User Confirmation** (unless `--force` flag is set): - - Display warning: - ``` - WARNING: You are about to uninstall {operator-name} from namespace {namespace}. - This will remove: - - Subscription: {subscription-name} - - ClusterServiceVersion: {csv-name} - - Operator deployments - [- Custom Resource Definitions (if --remove-crds is set)] - [- Namespace {namespace} (if --remove-namespace is set)] - - Are you sure you want to continue? (yes/no) - ``` - - Wait for user confirmation - - If user says no, abort operation - -6. **Delete Subscription**: - - Remove the operator's subscription: - ```bash - oc delete subscription {operator-name} -n {namespace} - ``` - - Verify deletion: - ```bash - oc get subscription {operator-name} -n {namespace} --ignore-not-found - ``` - - Display result - -7. **Delete ClusterServiceVersion (CSV)**: - - Get the CSV name: - ```bash - oc get csv -n {namespace} -o jsonpath='{.items[?(@.spec.displayName contains "{operator-name}")].metadata.name}' - ``` - - Delete the CSV: - ```bash - oc delete csv {csv-name} -n {namespace} - ``` - - This will automatically remove operator deployments - - Verify CSV is deleted: - ```bash - oc get csv -n {namespace} --ignore-not-found - ``` - -8. 
**Remove Operator Deployments** (if still present): - - List deployments created by the operator: - ```bash - oc get deployments -n {namespace} - ``` - - For operators like cert-manager with labeled resources: - ```bash - oc delete deployment -n {namespace} -l app.kubernetes.io/instance={operator-base-name} - ``` - - Verify deployments are deleted: - ```bash - oc get deployments -n {namespace} - ``` - -8.5. **Check for Orphaned Custom Resources** (before removing CRDs): - - Get list of CRDs managed by the operator from CSV: - ```bash - oc get csv -n {namespace} -o jsonpath='{.items[0].spec.customresourcedefinitions.owned[*].name}' - ``` - - For each CRD, search for CR instances across all namespaces: - ```bash - oc get --all-namespaces --ignore-not-found - ``` - - If CRs exist, list them with details: - ``` - WARNING: Found custom resources that may prevent clean uninstallation: - - namespace-1/ (kind: ) - - namespace-2/ (kind: ) - - These resources should be deleted before uninstalling the operator. - Do you want to delete these custom resources? (yes/no) - ``` - - If user confirms, delete each CR: - ```bash - oc delete -n - ``` - - This prevents namespace from getting stuck in Terminating state - - Reference: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-reinstalling-operators-after-failed-uninstallation_olm-troubleshooting-operator-issues - -9. **Remove Custom Resource Definitions** (if `--remove-crds` flag is set): - - **WARNING**: Display critical warning to user: - ``` - WARNING: Removing CRDs will delete ALL custom resources of these types across the entire cluster! - This action is irreversible and will affect all namespaces. - - Are you absolutely sure you want to remove CRDs? 
(yes/no) - ``` - - If user confirms, proceed with CRD removal - - Get list of CRDs owned by the operator: - ```bash - oc get csv {csv-name} -n {namespace} -o jsonpath='{.spec.customresourcedefinitions.owned[*].name}' - ``` - - For each CRD, check if custom resources exist: - ```bash - oc get {crd-name} --all-namespaces --ignore-not-found - ``` - - Display warning if custom resources exist - - Delete CRDs: - ```bash - oc delete crd {crd-name} - ``` - -10. **Remove Namespace** (if `--remove-namespace` flag is set): - - **WARNING**: Display warning: - ``` - WARNING: Removing namespace {namespace} will delete all resources in this namespace! - - Are you sure you want to remove namespace {namespace}? (yes/no) - ``` - - If user confirms: - ```bash - oc delete namespace {namespace} - ``` - - Monitor namespace deletion with timeout: - ```bash - oc wait --for=delete namespace/{namespace} --timeout=120s - ``` - - If namespace gets stuck in "Terminating" state after 120 seconds: - - Check for resources preventing deletion: - ```bash - oc api-resources --verbs=list --namespaced -o name | \ - xargs -n 1 oc get --show-kind --ignore-not-found -n {namespace} - ``` - - Check for finalizers on the namespace: - ```bash - oc get namespace {namespace} -o jsonpath='{.metadata.finalizers}' - ``` - - Display helpful error message: - ``` - ERROR: Namespace {namespace} is stuck in Terminating state. - - Possible causes: - - Resources with finalizers preventing deletion - - API services that are unavailable - - Custom resources that cannot be deleted - - To diagnose and fix, run: /olm:diagnose {operator-name} {namespace} - - Manual troubleshooting: - 1. Check remaining resources: - oc api-resources --verbs=list --namespaced -o name | \ - xargs -n 1 oc get --show-kind --ignore-not-found -n {namespace} - - 2. Check namespace finalizers: - oc get namespace {namespace} -o yaml | grep -A5 finalizers - - WARNING: Do NOT force-delete the namespace as it can lead to unstable cluster behavior. 
- See: https://access.redhat.com/solutions/4165791 - ``` - - Exit with error code - - Note: OperatorGroup will be automatically deleted with the namespace - -11. **Post-Uninstall Verification**: - - Verify all resources are cleaned up: - ```bash - oc get subscription,csv,installplan -n {namespace} --ignore-not-found - ``` - - Check if any CRDs remain (if they were supposed to be deleted): - ```bash - oc get crd | grep - ``` - - If uninstalling without `--remove-namespace`, check namespace is clean: - ```bash - oc get all -n {namespace} - ``` - - Display any remaining resources with suggestions for cleanup - -12. **Display Uninstallation Summary**: - - Show what was successfully removed: - ``` - ✓ Uninstallation Summary: - ✓ Subscription '{operator-name}' deleted - ✓ CSV '{csv-name}' deleted - ✓ Operator deployments removed - [✓ X custom resources deleted] - [✓ Y CRDs removed] - [✓ Namespace '{namespace}' deleted] - ``` - - If CRDs or namespace were NOT removed, provide instructions: - ``` - Note: The following resources were NOT removed: - - Custom Resource Definitions (use --remove-crds to remove) - - Namespace {namespace} (use --remove-namespace to remove) - - To completely remove all operator resources, run: - /olm:uninstall {operator-name} {namespace} --remove-crds --remove-namespace - ``` - - **Important warning about reinstallation**: - ``` - IMPORTANT: Before reinstalling this operator, verify all resources are cleaned: - - oc get subscription,csv,installplan -n {namespace} - oc get crd | grep - - Failure to completely uninstall may cause reinstallation issues. 
- See: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-reinstalling-operators-after-failed-uninstallation_olm-troubleshooting-operator-issues - ``` - -## Return Value -- **Success**: Operator uninstalled successfully with summary of removed resources -- **Partial Success**: Some resources removed with warnings about remaining resources -- **Error**: Uninstallation failed with specific error message -- **Format**: Structured output showing: - - Subscription deletion status - - CSV deletion status - - Deployment removal status - - CRD removal status (if applicable) - - Namespace deletion status (if applicable) - -## Examples - -1. **Uninstall cert-manager-operator (basic)**: - ``` - /olm:uninstall openshift-cert-manager-operator - ``` - -2. **Uninstall with custom namespace**: - ``` - /olm:uninstall openshift-cert-manager-operator my-cert-manager - ``` - -3. **Complete cleanup including namespace**: - ``` - /olm:uninstall openshift-cert-manager-operator cert-manager-operator --remove-crds --remove-namespace - ``` - This performs a complete cleanup of all operator-related resources. - -4. **Force uninstall without prompts**: - ``` - /olm:uninstall openshift-cert-manager-operator cert-manager-operator --force - ``` - Skips all confirmation prompts (use with caution!). 
- -## Arguments -- **$1** (operator-name): The name of the operator to uninstall (required) - - Example: "openshift-cert-manager-operator" - - Must match the Subscription name -- **$2** (namespace): The namespace where the operator is installed (optional) - - Default: the operator name with the "openshift-" prefix removed - - Example: "cert-manager-operator" -- **$3+** (flags): Optional flags (can combine multiple): - - `--remove-crds`: Remove Custom Resource Definitions (WARNING: affects entire cluster) - - `--remove-namespace`: Remove the operator's namespace and all its resources - - `--force`: Skip all confirmation prompts (use with caution) - -## Safety Features - -1. **Multiple Confirmations**: Separate confirmations for CRD and namespace removal -2. **Detailed Warnings**: Clear warnings about the scope of deletions -3. **Verification Steps**: Checks that resources exist before attempting deletion -4. **Summary Report**: Detailed summary of what was and wasn't removed -5. **Graceful Failures**: Continues with remaining steps if individual deletions fail - -## Troubleshooting - -- **Subscription not found**: Verify the operator name and namespace: - ```bash - oc get subscriptions --all-namespaces | grep {operator-name} - ``` -- **CSV won't delete**: Check for finalizers: - ```bash - oc get csv {csv-name} -n {namespace} -o yaml | grep finalizers - ``` - If finalizers are present, they may be waiting for resources to be cleaned up. Check operator logs and events. - -- **Namespace stuck in Terminating**: This is a common issue after operator uninstallation. - ```bash - # Find remaining resources - oc api-resources --verbs=list --namespaced -o name | \ - xargs -n 1 oc get --show-kind --ignore-not-found -n {namespace} - - # Check namespace finalizers - oc get namespace {namespace} -o yaml | grep -A5 finalizers - ``` - **IMPORTANT**: Do not force-delete the namespace. This can cause cluster instability. 
- Instead, use `/olm:diagnose {operator-name} {namespace}` to diagnose and fix the issue. - -- **CRDs won't delete**: Check for remaining custom resources: - ```bash - oc get {crd-name} --all-namespaces - ``` - CRDs cannot be deleted while CR instances exist. Delete all CRs first. - -- **Custom resources won't delete**: Some CRs may have finalizers preventing deletion: - ```bash - oc get -n -o yaml | grep finalizers - ``` - The operator controller (if still running) should remove finalizers. If operator is already deleted, you may need to manually patch the CR to remove finalizers (use with extreme caution). - -- **Permission denied**: Ensure you have cluster-admin privileges for CRD deletion: - ```bash - oc auth can-i delete crd - ``` - -- **Reinstallation fails after uninstall**: This usually means cleanup was incomplete. - Run these checks before reinstalling: - ```bash - # Check for remaining subscriptions/CSVs - oc get subscription,csv -n {namespace} - - # Check for remaining CRDs - oc get crd | grep - - # Check if namespace is clean or stuck - oc get namespace {namespace} - ``` - See: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-reinstalling-operators-after-failed-uninstallation_olm-troubleshooting-operator-issues - -## Related Commands - -- `/olm:install` - Install a day-2 operator -- `/olm:list` - List installed operators -- `/olm:status` - Check operator status before uninstalling -- `/olm:diagnose` - Diagnose and fix uninstallation issues -- `/olm:upgrade` - Upgrade an operator - -## Additional Resources - -- [Red Hat OpenShift: Deleting Operators from a cluster](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-deleting-operators-from-a-cluster) -- [Red Hat OpenShift: Reinstalling Operators after failed 
uninstallation](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-reinstalling-operators-after-failed-uninstallation_olm-troubleshooting-operator-issues) -- [Operator Lifecycle Manager Documentation](https://olm.operatorframework.io/) - diff --git a/plugins/olm/commands/upgrade.md b/plugins/olm/commands/upgrade.md deleted file mode 100644 index 75434f615..000000000 --- a/plugins/olm/commands/upgrade.md +++ /dev/null @@ -1,349 +0,0 @@ ---- -description: Update an operator to the latest version or switch channels -argument-hint: <operator-name> [namespace] [--channel=<channel-name>] [--approve] ---- - -## Name -olm:upgrade - -## Synopsis -``` -/olm:upgrade <operator-name> [namespace] [--channel=<channel-name>] [--approve] -``` - -## Description -The `olm:upgrade` command updates an installed operator to the latest version in its current channel or switches to a different channel. It can also approve pending InstallPlans for operators with manual approval mode. - -This command helps you: -- Update operators to the latest version in their channel -- Switch operators to different channels (e.g., stable to tech-preview) -- Approve pending upgrade InstallPlans for manual approval mode -- Monitor upgrade progress -- Roll back on failure (if possible via OLM) - -## Implementation - -The command performs the following steps: - -1. **Parse Arguments**: - - `$1`: Operator name (required) - Name of the operator to upgrade - - `$2`: Namespace (optional) - Namespace where operator is installed - - If not provided, searches for the operator across all namespaces - - `$3+`: Flags (optional): - - `--channel=<channel-name>`: Switch to a different channel - - `--approve`: Automatically approve pending InstallPlan (for manual approval mode) - -2. **Prerequisites Check**: - - Verify `oc` CLI is installed: `which oc` - - Verify cluster access: `oc whoami` - - Check if user has sufficient privileges - -3. 
**Locate Operator**: - - If namespace provided, verify operator exists: - ```bash - oc get subscription {operator-name} -n {namespace} --ignore-not-found - ``` - - If no namespace provided, search across all namespaces: - ```bash - oc get subscription --all-namespaces -o json | jq -r '.items[] | select(.spec.name=="{operator-name}") | .metadata.namespace' - ``` - - If not found, display error with suggestions - - If multiple instances found, prompt user to specify namespace - -4. **Get Current State**: - - Get current Subscription: - ```bash - oc get subscription {operator-name} -n {namespace} -o json - ``` - - Extract: - - Current channel: `.spec.channel` - - Install plan approval: `.spec.installPlanApproval` - - Installed CSV: `.status.installedCSV` - - Current CSV: `.status.currentCSV` - - Get current CSV version: - ```bash - oc get csv {installed-csv} -n {namespace} -o jsonpath='{.spec.version}' - ``` - -5. **Check for Available Updates**: - - Get PackageManifest: - ```bash - oc get packagemanifest {operator-name} -n openshift-marketplace -o json - ``` - - Extract available channels and their latest versions - - If `--channel` flag is specified, verify channel exists - - If no channel flag, check for updates in current channel - - Compare current version with latest available version - - Reference: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-updating-operators - -6. **Display Upgrade Plan**: - ``` - Operator Upgrade Plan: - - Operator: {display-name} - Namespace: {namespace} - Current Version: {current-version} - Current Channel: {current-channel} - - [If switching channels:] - Target Channel: {new-channel} - Target Version: {new-version} - - [If upgrading in same channel:] - Latest Version: {latest-version} (in channel: {current-channel}) - - Approval Mode: {Automatic|Manual} - ``` - -7. 
**Check for Pending InstallPlans** (for manual approval mode): - - Get pending InstallPlans: - ```bash - oc get installplan -n {namespace} -o json | jq '.items[] | select(.spec.approved==false)' - ``` - - If pending InstallPlan exists and `--approve` flag is set: - - Display InstallPlan details - - Approve the InstallPlan (skip to step 9) - - If pending InstallPlan exists and no `--approve` flag: - ``` - ⏸️ Pending InstallPlan found (requires manual approval) - - InstallPlan: {installplan-name} - Target Version: {target-version} - - To approve: /olm:upgrade {operator-name} {namespace} --approve - Or use: /olm:approve {operator-name} {namespace} - ``` - - Exit, waiting for user to approve - -8. **Perform Channel Switch** (if `--channel` flag provided): - - Confirm with user (unless `--force` flag): - ``` - WARNING: Switching channels may upgrade or downgrade the operator. - - Current: {current-channel} ({current-version}) - Target: {new-channel} ({target-version}) - - Continue? (yes/no) - ``` - - Update Subscription to new channel: - ```bash - oc patch subscription {operator-name} -n {namespace} \ - --type merge --patch '{"spec":{"channel":"{new-channel}"}}' - ``` - - Display confirmation: - ``` - ✓ Subscription updated to channel: {new-channel} - ``` - -9. **Approve Pending InstallPlan** (if `--approve` flag or automatic approval): - - If approval mode is Manual and `--approve` flag is set: - ```bash - oc patch installplan {installplan-name} -n {namespace} \ - --type merge --patch '{"spec":{"approved":true}}' - ``` - - Display approval confirmation: - ``` - ✓ InstallPlan approved: {installplan-name} - ``` - -10. **Monitor Upgrade Progress**: - - Wait for new InstallPlan to be created (if switching channels): - ```bash - oc get installplan -n {namespace} -w --timeout=60s - ``` - - Wait for new CSV to reach "Succeeded" phase: - ```bash - oc get csv -n {namespace} -w --timeout=300s - ``` - - Display progress updates: - ``` - 🔄 Upgrade in progress... 
- ⏳ Waiting for InstallPlan to complete... - ⏳ New CSV installing: {new-csv-name} - ⏳ Old CSV replacing: {old-csv-name} - ``` - - Poll every 10 seconds to check status - - Timeout: 10 minutes for upgrade to complete - -11. **Verify Upgrade Success**: - - Check new CSV status: - ```bash - oc get csv -n {namespace} -o json - ``` - - Verify new CSV phase is "Succeeded" - - Get new version: - ```bash - oc get csv {new-csv-name} -n {namespace} -o jsonpath='{.spec.version}' - ``` - - Check deployments are healthy: - ```bash - oc get deployments -n {namespace} - ``` - - Check pods are running: - ```bash - oc get pods -n {namespace} - ``` - -12. **Display Upgrade Summary**: - ``` - ✓ Operator Upgrade Complete! - - Operator: {display-name} - Namespace: {namespace} - Previous Version: {old-version} - Current Version: {new-version} - Channel: {channel} - - Deployment Status: - - {deployment-1}: 1/1 replicas ready - - {deployment-2}: 1/1 replicas ready - - To check status: /olm:status {operator-name} {namespace} - ``` - -13. **Handle Upgrade Failures**: - - If upgrade fails or times out: - ``` - ❌ Operator upgrade failed - - Current State: - - CSV: {csv-name} (Phase: {phase}) - - Message: {error-message} - - Troubleshooting steps: - 1. Check CSV status: oc describe csv {csv-name} -n {namespace} - 2. Check events: oc get events -n {namespace} --sort-by='.lastTimestamp' - 3. Check InstallPlan: oc get installplan -n {namespace} - 4. 
Run diagnostics: /olm:diagnose {operator-name} {namespace} - - To roll back (if OLM supports it): - oc patch subscription {operator-name} -n {namespace} \ - --type merge --patch '{"spec":{"channel":"{old-channel}"}}' - ``` - -## Return Value -- **Success**: Operator upgraded successfully with new version details -- **Pending Approval**: Upgrade waiting for manual approval with instructions -- **No Update Available**: Operator is already at the latest version -- **Error**: Upgrade failed with specific error message and troubleshooting guidance -- **Format**: Structured output showing: - - Previous and current versions - - Channel information - - Deployment and pod status - - Next steps or related commands - -## Examples - -1. **Check for and install updates in current channel**: - ``` - /olm:upgrade openshift-cert-manager-operator - ``` - -2. **Upgrade with specific namespace**: - ``` - /olm:upgrade external-secrets-operator eso-operator - ``` - -3. **Switch to a different channel**: - ``` - /olm:upgrade openshift-cert-manager-operator cert-manager-operator --channel=tech-preview-v1.14 - ``` - This switches from the stable-v1 channel to the tech-preview-v1.14 channel. - -4. **Approve pending upgrade (manual approval mode)**: - ``` - /olm:upgrade openshift-cert-manager-operator --approve - ``` - -5. 
**Switch channel and approve in one command**: - ``` - /olm:upgrade prometheus prometheus-operator --channel=beta --approve - ``` - -## Arguments -- **$1** (operator-name): Name of the operator to upgrade (required) - - Example: "openshift-cert-manager-operator" - - Must match the operator's Subscription name -- **$2** (namespace): Namespace where operator is installed (optional) - - If not provided, searches all namespaces - - Example: "cert-manager-operator" -- **$3+** (flags): Optional flags - - `--channel=`: Switch to specified channel - - Example: `--channel=stable-v1`, `--channel=tech-preview` - - Triggers upgrade/downgrade to the version in that channel - - `--approve`: Automatically approve pending InstallPlan - - Only needed for operators with Manual approval mode - - Equivalent to `/olm:approve` command - -## Notes - -- **Automatic Updates**: Operators with `installPlanApproval: Automatic` will upgrade automatically when new versions are available in their channel -- **Manual Approval**: Operators with `installPlanApproval: Manual` require explicit approval via `--approve` flag or `/olm:approve` command -- **Channel Switching**: Changing channels may result in upgrade or downgrade depending on the versions in each channel -- **Rollback**: OLM has limited rollback support. 
Switching back to the previous channel may work, but data migration issues may occur -- **Upgrade Timing**: Upgrades happen according to the operator's upgrade strategy (some may cause downtime) - -## Troubleshooting - -- **No updates available**: - ```bash - # Check current version - oc get csv -n {namespace} - - # Check available versions - oc get packagemanifest {operator-name} -n openshift-marketplace -o json - ``` - -- **Upgrade stuck or pending**: - ```bash - # Check InstallPlan status - oc get installplan -n {namespace} - - # Check for events - oc get events -n {namespace} --sort-by='.lastTimestamp' | tail -20 - ``` - -- **Manual approval required**: - ```bash - # List pending InstallPlans - oc get installplan -n {namespace} -o json | jq '.items[] | select(.spec.approved==false)' - - # Approve specific InstallPlan - /olm:approve {operator-name} {namespace} - ``` - -- **Upgrade failed**: - ```bash - # Check CSV status - oc describe csv -n {namespace} - - # Check operator logs - oc logs -n {namespace} deployment/{operator-deployment} - - # Run diagnostics - /olm:diagnose {operator-name} {namespace} - ``` - -- **Rollback needed**: - - OLM doesn't have built-in rollback - - Can try switching back to previous channel, but may have issues: - ```bash - oc patch subscription {operator-name} -n {namespace} \ - --type merge --patch '{"spec":{"channel":"{old-channel}"}}' - ``` - - Consider backup/restore of custom resources before upgrading - -## Related Commands - -- `/olm:status ` - Check current version and available updates -- `/olm:approve ` - Approve pending InstallPlans -- `/olm:install ` - Install an operator -- `/olm:diagnose ` - Diagnose upgrade issues - -## Additional Resources - -- [Red Hat OpenShift: Updating Installed Operators](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-updating-operators) -- [Red Hat OpenShift: Approving Operator 
Upgrades](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-approving-operator-upgrades_olm-updating-operators) -- [Operator Lifecycle Manager Documentation](https://olm.operatorframework.io/) - - diff --git a/plugins/openshift/commands/destroy-cluster.md b/plugins/openshift/commands/destroy-cluster.md deleted file mode 100644 index 7c5b85de6..000000000 --- a/plugins/openshift/commands/destroy-cluster.md +++ /dev/null @@ -1,360 +0,0 @@ ---- -description: Destroy an OpenShift cluster created by create-cluster command -argument-hint: "[install-dir]" ---- - -## Name -openshift:destroy-cluster - -## Synopsis -``` -/openshift:destroy-cluster [install-dir] -``` - -## Description - -The `destroy-cluster` command safely destroys an OpenShift Container Platform (OCP) cluster that was previously created using the `/openshift:create-cluster` command. It locates the appropriate installer binary, verifies the cluster information, and performs cleanup of all cloud resources. - -This command is useful for: -- Cleaning up development/test clusters after testing -- Removing failed cluster installations -- Freeing up cloud resources and quotas - -**⚠️ WARNING**: This operation is **irreversible** and will permanently delete: -- All cluster resources (VMs, load balancers, storage, etc.) -- All data stored in the cluster -- All configuration and credentials -- DNS records (if managed by the installer) - -## Prerequisites - -Before using this command, ensure you have: - -1. **Installation directory** from the original cluster creation - - Contains the cluster metadata and terraform state - - Located at `{cluster-name}-install-{timestamp}` by default - -2. **OpenShift installer binary** that matches the cluster version - - Should be available at `~/.openshift-installers/openshift-install-{version}` - - Same version used to create the cluster - -3. 
**Cloud Provider Credentials** still configured and valid - - Same credentials used during cluster creation - - Must have permissions to delete resources - -4. **Network connectivity** to the cloud provider - - Required to communicate with cloud APIs - -## Arguments - -- **install-dir** (optional): Path to the cluster installation directory - - Default: Interactive prompt to select from available installation directories - - Must contain cluster metadata files (metadata.json, terraform.tfstate, etc.) - - Example: `./my-cluster-install-20251028-120000` - -## Implementation - -The command performs the following steps: - -### 1. Locate Installation Directory - -If `install-dir` is not provided: -- Search for installation directories in the current directory -- Look for directories matching pattern `*-install-*` or containing `.openshift_install_state.json` -- Present a list of found directories to the user for selection -- Allow user to manually enter a path if directory not found - -If `install-dir` is provided: -- Validate the directory exists -- Verify it contains cluster metadata files - -### 2. Extract Cluster Information - -Read cluster details from the installation directory: -```bash -# Read cluster metadata -if [ -f "$INSTALL_DIR/metadata.json" ]; then - CLUSTER_NAME=$(jq -r '.clusterName' "$INSTALL_DIR/metadata.json") - INFRA_ID=$(jq -r '.infraID' "$INSTALL_DIR/metadata.json") - PLATFORM=$(jq -r '.platform' "$INSTALL_DIR/metadata.json") -fi - -# Try to extract version from cluster-info or log files -VERSION=$(grep -oE 'openshift-install.*v[0-9]+\.[0-9]+\.[0-9]+' "$INSTALL_DIR/.openshift_install.log" | head -1 | grep -oE '[0-9]+\.[0-9]+\.[0-9]+[^"]*' | head -1) -``` - -### 3. 
Display Cluster Information and Confirm - -Show the user what will be destroyed: -``` -Cluster Information: - Name: ${CLUSTER_NAME} - Infrastructure ID: ${INFRA_ID} - Platform: ${PLATFORM} - Installation Directory: ${INSTALL_DIR} - Version: ${VERSION} - -⚠️ WARNING: This will permanently destroy the cluster and all its resources! - -This action will delete: - - All cluster VMs and compute resources - - Load balancers and networking resources - - Storage volumes and persistent data - - DNS records - - All cluster configuration - -Are you sure you want to destroy this cluster? (yes/no): -``` - -**Important**: Require the user to type "yes" (not just "y") to confirm destruction. - -### 4. Locate the Correct Installer - -Find the installer binary that matches the cluster version: -```bash -INSTALLER_DIR="${HOME}/.openshift-installers" -INSTALLER_PATH="$INSTALLER_DIR/openshift-install-${VERSION}" - -# Check if the version-specific installer exists -if [ ! -f "$INSTALLER_PATH" ]; then - echo "Warning: Installer for version ${VERSION} not found at ${INSTALLER_PATH}" - echo "Searching for alternative installers..." - - # Look for any installer in the installers directory - AVAILABLE_INSTALLERS=$(find "$INSTALLER_DIR" -name "openshift-install-*" -type f 2>/dev/null) - - if [ -n "$AVAILABLE_INSTALLERS" ]; then - echo "Found installers:" - echo "$AVAILABLE_INSTALLERS" - echo "" - echo "You may use a different version installer, but this may cause issues." - echo "Would you like to:" - echo " 1. Use an available installer from the list above" - echo " 2. Extract the correct installer from the release image" - echo " 3. Cancel the operation" - else - echo "No installers found. Would you like to extract the installer? (yes/no):" - fi -fi - -# Verify installer works -"$INSTALLER_PATH" version -``` - -### 5. Backup Important Files (Optional) - -Offer to backup key files before destruction: -``` -Would you like to backup cluster information before destroying? 
(yes/no): -``` - -If yes, create a backup: -```bash -BACKUP_DIR="${INSTALL_DIR}-backup-$(date +%Y%m%d-%H%M%S)" -mkdir -p "$BACKUP_DIR" - -# Backup key files -cp "$INSTALL_DIR/metadata.json" "$BACKUP_DIR/" 2>/dev/null -cp "$INSTALL_DIR/auth/kubeconfig" "$BACKUP_DIR/" 2>/dev/null -cp "$INSTALL_DIR/auth/kubeadmin-password" "$BACKUP_DIR/" 2>/dev/null -cp "$INSTALL_DIR/.openshift_install.log" "$BACKUP_DIR/" 2>/dev/null -cp "$INSTALL_DIR/install-config.yaml.backup" "$BACKUP_DIR/" 2>/dev/null - -echo "Backup created at: $BACKUP_DIR" -``` - -### 6. Run Cluster Destroy - -Execute the destroy command: -```bash -cd "$INSTALL_DIR" - -echo "Starting cluster destruction..." -echo "This may take 10-15 minutes..." - -"$INSTALLER_PATH" destroy cluster --dir=. --log-level=debug - -DESTROY_EXIT_CODE=$? -``` - -Monitor the destruction progress and display status updates. - -### 7. Verify Cleanup - -After the destroy command completes: - -1. **Check exit code**: - ```bash - if [ $DESTROY_EXIT_CODE -eq 0 ]; then - echo "✅ Cluster destroyed successfully" - else - echo "❌ Cluster destruction failed with exit code: $DESTROY_EXIT_CODE" - echo "Check logs at: $INSTALL_DIR/.openshift_install.log" - fi - ``` - -2. **Verify cloud resources** (platform-specific): - - AWS: Check for lingering resources with tag `kubernetes.io/cluster/${INFRA_ID}` - - Azure: Verify resource group deletion - - GCP: Check project for remaining resources - -3. **List any remaining resources**: - ``` - If any resources remain, provide commands to manually clean them up. - ``` - -### 8. Cleanup Installation Directory (Optional) - -Ask the user if they want to remove the installation directory: -``` -The cluster has been destroyed. Would you like to delete the installation directory? 
(yes/no): - Directory: $INSTALL_DIR - Size: $(du -sh "$INSTALL_DIR" | cut -f1) -``` - -If yes: -```bash -rm -rf "$INSTALL_DIR" -echo "Installation directory removed" -``` - -If no: -```bash -echo "Installation directory preserved at: $INSTALL_DIR" -echo "You can manually remove it later with: rm -rf $INSTALL_DIR" -``` - -### 9. Display Summary - -Show final summary: -``` -Cluster Destruction Summary: - Cluster Name: ${CLUSTER_NAME} - Status: Successfully destroyed - Platform: ${PLATFORM} - Duration: ${DURATION} - Backup: ${BACKUP_DIR} (if created) - -Next steps: - - Verify your cloud console for any lingering resources - - Check your cloud billing to ensure resources are no longer incurring charges - - Remove installation directory if not already deleted: ${INSTALL_DIR} -``` - -## Error Handling - -If destruction fails, the command should: - -1. **Capture error logs** from `.openshift_install.log` -2. **Identify the failure point**: - - Timeout waiting for resource deletion - - Permission errors - - API rate limiting - - Network connectivity issues - - Resources locked or in use -3. **Provide recovery options**: - - Retry the destroy operation - - Manual cleanup instructions for specific resources - - Contact support if critical errors occur - -Common failure scenarios: - -**Timeout errors**: -```bash -# Some resources may take longer to delete -# Retry the destroy command: -"$INSTALLER_PATH" destroy cluster --dir="$INSTALL_DIR" -``` - -**Permission errors**: -``` -Error: Cloud credentials may have expired or lack permissions -Solution: - 1. Verify cloud credentials are still valid - 2. Check IAM permissions for resource deletion - 3. Re-run the destroy command after fixing credentials -``` - -**Partial destruction**: -``` -Warning: Some resources could not be deleted automatically. 
- -Remaining resources: - - Load balancer: ${LB_NAME} - - Security group: ${SG_NAME} - - S3 bucket: ${BUCKET_NAME} - -Manual cleanup commands: - [Platform-specific commands to delete remaining resources] -``` - -## Examples - -### Example 1: Destroy cluster with interactive directory selection -``` -/openshift:destroy-cluster -``` -The command will search for installation directories and prompt you to select one. - -### Example 2: Destroy cluster with specific directory -``` -/openshift:destroy-cluster ./my-cluster-install-20251028-120000 -``` - -### Example 3: Destroy cluster with full path -``` -/openshift:destroy-cluster /home/user/clusters/test-cluster-install-20251028-120000 -``` - -## Common Issues - -1. **Installation directory not found**: - - Ensure you're in the correct directory - - Provide the full path to the installation directory - - Check if the directory was moved or renamed - -2. **Installer binary not found**: - - The command will help you extract the correct installer - - Alternatively, manually place the installer in `~/.openshift-installers/` - -3. **Cloud credentials expired**: - - Refresh your cloud credentials - - Re-authenticate with the cloud provider CLI - - Re-run the destroy command - -4. **Resources already deleted manually**: - - The destroy command may fail if resources were manually deleted - - Check the logs and manually clean up any remaining resources - - Remove the installation directory manually - -5. **Destroy hangs or times out**: - - Some resources may take longer to delete (especially load balancers) - - Wait for the operation to complete (can take 15-30 minutes) - - If truly stuck, cancel and retry - - Check cloud console for resource status - -## Safety Features - -This command includes several safety measures: - -1. **Confirmation required**: Must type "yes" to proceed -2. **Cluster information displayed**: Shows what will be destroyed before proceeding -3. **Backup option**: Offers to backup important files -4. 
**Validation checks**: Verifies installation directory and metadata -5. **Detailed logging**: All operations logged for troubleshooting -6. **Error recovery**: Provides manual cleanup instructions if automated cleanup fails - -## Return Value - -- **Success**: Returns 0 and displays destruction summary -- **Failure**: Returns non-zero and displays error diagnostics with recovery instructions - -## See Also - -- `/openshift:create-cluster` - Create a new OCP cluster -- OpenShift Documentation: https://docs.openshift.com/container-platform/latest/installing/ -- Platform-specific cleanup guides - -## Arguments: - -- **$1** (install-dir): Path to the cluster installation directory created by create-cluster (optional, interactive if not provided) diff --git a/plugins/origin/.claude-plugin/plugin.json b/plugins/origin/.claude-plugin/plugin.json deleted file mode 100644 index 285629b2b..000000000 --- a/plugins/origin/.claude-plugin/plugin.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "name": "origin", - "description": "Helpers for openshift/origin development.", - "version": "0.0.2", - "author": { - "name": "openshift" - } -} diff --git a/plugins/origin/README.md b/plugins/origin/README.md deleted file mode 100644 index a60100d35..000000000 --- a/plugins/origin/README.md +++ /dev/null @@ -1,124 +0,0 @@ -# Origin Plugin - -Utilities and workflow helpers for developing and reviewing changes in the -openshift/origin repository. -This plugin focuses on improving test quality, code consistency, and CI suite -integration for Origin contributions. - -## Commands - -### /origin:two-node-origin-pr-helper - -Expert review tool for PRs that add or modify Two Node (Fencing or Arbiter) tests -under test/extended/two_node/. 
- -This command performs: - -- Automatic discovery of changed Two Node test files -- Analysis of Ginkgo Describe/It blocks, suite tags, and Serial annotations -- Review of test logic, determinism, cleanup behavior, and structure -- Suggestions for reusing existing Origin and Kubernetes utilities -- Identification of duplicated logic that should use shared helpers -- Recommendations for suite placement and Serial usage -- Recommendations for CI lane coverage in openshift/release -- Generation of ready-to-paste PR text for both Origin and Release repositories - -Use this helper when contributing to Origin’s Two Node test suite or reviewing PRs -that affect Two Node behavior. - -See the commands/ directory for full documentation. - -## Installation - -### From the Claude Code Plugin Marketplace - -1. Add the OpenShift ai-helpers marketplace: - - /plugin marketplace add openshift-eng/ai-helpers - -2. Install the origin plugin: - - /plugin install origin@ai-helpers - -3. Use the command: - - /origin:two-node-origin-pr-helper - -## Available Commands - -### Two Node PR Review - -#### /origin:two-node-origin-pr-helper — Review Two Node Tests in Origin - -This command performs a full expert review of PRs that modify or add Two Node -(Fencing or Arbiter) tests under test/extended/two_node/. - -The helper covers: - -- Code correctness and logical consistency -- Ginkgo test structure and best practices -- Suite tagging and Serial analysis -- Utility/helper reuse (Origin + Kubernetes) -- CI suite and lane coverage recommendations -- PR description generation - -See commands/two-node-origin-pr-helper.md for full documentation. - -## Development - -### Adding New Commands - -To add a new command to this plugin: - -1. Create a markdown file in commands/: - - touch plugins/origin/commands/your-command.md - -2. 
Use existing commands as a template and include sections: - - - Name - - Synopsis - - Description - - Implementation behavior - - Return value / output structure - - Examples - - Arguments - - Error handling - - Additional context if needed - -3. Test the command: - - /origin:your-command - -## Plugin Structure - -plugins/origin/ -├── .claude-plugin/ -│ └── plugin.json -├── commands/ -│ └── two-node-origin-pr-helper.md -└── README.md - -## Related Plugins - -- openshift — General OpenShift development and CI helpers -- ci — Prow/CI-related workflow helpers -- git — Git workflow helpers -- jira — Jira automation helpers -- utils — General-purpose utilities - -## Contributing - -Contributions are welcome. - -When adding Origin-specific commands: - -- Ensure the workflow relates directly to openshift/origin -- Follow existing documentation patterns -- Provide actionable examples and behavior explanations -- Use realistic Origin repository paths and test patterns -- Update this README with any new commands - -## License - -See [LICENSE](../../LICENSE) for details. diff --git a/plugins/origin/commands/two-node-origin-pr-helper.md b/plugins/origin/commands/two-node-origin-pr-helper.md deleted file mode 100644 index 4bf35e8fb..000000000 --- a/plugins/origin/commands/two-node-origin-pr-helper.md +++ /dev/null @@ -1,174 +0,0 @@ ---- -description: Expert review tool for PRs that add or modify Two Node (Fencing or Arbiter) tests under test/extended/two_node/ in openshift/origin. -argument-hint: "[--url PR_URL] [] [--depth quick|full]" ---- - -## Name - -/origin:two-node-origin-pr-helper — Review Two Node (Fencing/Arbiter) tests in openshift/origin. - -## Synopsis -``` -/origin:two-node-origin-pr-helper [--url PR_URL] [] [--depth quick|full] -``` -## Description - -The /origin:two-node-origin-pr-helper command is an expert review tool for PRs that add or modify -Two Node (Fencing or Arbiter) tests under test/extended/two_node/ in openshift/origin. 
- -It: - -- Discovers changed Two Node test files from the current branch. -- Analyzes Ginkgo Describe / Context / It blocks, suite tags, and [Serial] markers. -- Reviews test logic, structure, cleanup, and determinism. -- Suggests reuse of existing Origin and Kubernetes helpers instead of ad-hoc code. -- Recommends suite + [Serial] tagging and CI coverage. -- Generates ready-to-paste PR description text for the Origin PR. -- Suggests CI lane characteristics for openshift/release (without generating full PR text). - -Use this command when creating or reviewing Origin PRs that touch the Two Node test suite and you -want a focused, reproducible review of test design, helper usage, and CI integration. - -This is a specialized Origin review helper focused on Two Node tests and is intended as a building -block toward a future generic Origin review command. - -## Implementation - -The command should behave as follows. - -### 1. Argument handling - -Parse arguments from the invocation: - -- --url: - - Optional full PR URL (example: ) - - When provided, this takes precedence over any local git information. - -- (optional positional): - - Optional PR number (example: 30510) - -- --depth: - - quick: short, high-level summary - - full: detailed four-section output (default) - -Default behavior: - -- If --url is provided, use that PR. -- Else if is provided, use that PR in the current repo. -- Else infer the PR from the current git repository remote and branch name. -- Fail with a clear error message if the PR cannot be determined. - -### 2. Automatically discover relevant changes - -Assume the command is run inside a local checkout of the repo. - -- Determine changed files using git diff. -- Filter to Go files under test/extended/two_node/. -- Parse: - - Ginkgo Describe / Context / It blocks - - Suite tags - - [Serial] markers - - Helper imports - -### 3. Review test design and correctness - -For each test: - -- Validate alignment between intent and implementation. 
-- Validate degraded vs non-degraded behavior. -- Validate fencing vs arbiter semantics. -- Validate quorum, failover, and recovery expectations. - -Do not assume helper existence. Infer from imports and logic only. - -### 4. Suggest reuse of utilities and helpers - -Look for re-implemented logic where helpers already exist. - -Examples: - -- Origin utilities under github.com/openshift/origin/test/extended/util -- Kubernetes helpers under k8s.io/apimachinery and k8s.io/utils - -Call out: - -- Correct helper usage -- Missed reuse opportunities -- Duplication that should become shared Two Node helpers - -### 5. Evaluate structure and readability - -Review: - -- Describe / Context / It hierarchy -- By(...) usage -- Assertion clarity -- Avoidance of time.Sleep in favor of polling - -### 6. Recommend suite and Serial annotations - -- Prefer [Suite:openshift/two-node] for Two Node tests. -- Recommend [Serial] for: - - Cluster-scoped mutations - - Reboots - - Degradation or fencing actions - -- Recommend parallel for isolated, namespaced tests. - -Always explain why. - -### 7. Propose CI lane coverage - -- Determine if existing CI already covers the tests. -- If not, propose: - - Topology - - TEST_SUITE - - Feature gates - - Blocking vs periodic vs optional - -Do not hard-code lane names. - -### 8. Generate ready-to-paste text - -Produce: - -- Origin PR summary text -- Optional CI lane summary text (not a full release PR) - -The command is static and requires no cluster access. - ---- - -## Expected input - -/origin:two-node-origin-pr-helper --depth full -/origin:two-node-origin-pr-helper 30510 --depth full -/origin:two-node-origin-pr-helper --url --depth quick - ---- - -## Output structure - -Always respond in four sections: - -1. Summary of changes -2. Review of tests (design, logic, reuse) -3. Suite, Serial, and CI recommendations -4. 
Ready-to-paste text - -Respect --depth only: - -- quick → compact output -- full → detailed output - ---- - -## Example 1 — Degraded Two Node Fencing tests - -/origin:two-node-origin-pr-helper 30510 --depth full - ---- - -## Example 2 — Two Node Arbiter recovery tests - -/origin:two-node-origin-pr-helper --url --depth quick diff --git a/plugins/session/.claude-plugin/plugin.json b/plugins/session/.claude-plugin/plugin.json deleted file mode 100644 index 742a30080..000000000 --- a/plugins/session/.claude-plugin/plugin.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "name": "session", - "description": "A plugin to save and resume conversation sessions across long time intervals", - "version": "0.0.2", - "author": { - "name": "github.com/kuiwang02" - } -} diff --git a/plugins/session/README.md b/plugins/session/README.md deleted file mode 100644 index c55ac2bb2..000000000 --- a/plugins/session/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# Session Plugin - -Claude Code session management and persistence utilities. - -## Commands - -### `/session:save-session` - -Save the current conversation session to a markdown file for future continuation. - -This command captures the conversation context, allowing you to resume long-running tasks across multiple sessions. - -See [commands/save-session.md](commands/save-session.md) for full documentation. 
- -## Installation - -```bash -/plugin install session@ai-helpers -``` - diff --git a/plugins/session/commands/save-session.md b/plugins/session/commands/save-session.md deleted file mode 100644 index f93bc8867..000000000 --- a/plugins/session/commands/save-session.md +++ /dev/null @@ -1,138 +0,0 @@ ---- -description: Save current conversation session to markdown file for future continuation -argument-hint: "[optional-description]" ---- - -## Name -session:save-session - -## Synopsis - -``` -/save-session -/save-session [description] -``` - -## Description - -Saves the current conversation session to a comprehensive markdown file that enables seamless resumption of work after extended time intervals (days, weeks, or months). - -This command addresses limitations of Claude Code's built-in session management by capturing: -- Complete conversation context and technical rationale -- Detailed file modification tracking with line numbers -- Key technical decisions and alternatives considered -- Commands executed during the session -- Clear resumption instructions - -The generated session file is designed for engineers working across multiple projects with long gaps between sessions, providing all necessary context to continue work without losing momentum. 
- -## Implementation - -The command follows a five-phase process: - -### Phase 0: Input Sanitization -If a description argument is provided, sanitize it for safe filename usage: -- Convert all spaces to hyphens -- Convert to lowercase -- Remove or replace special characters (keep only alphanumeric, hyphens, and underscores) -- Truncate to 100 characters maximum if longer -- Example: "investigating OCPBUGS-12345 regarding routes" → "investigating-ocpbugs-12345-regarding-routes" - -### Phase 1: Context Analysis -- Summarizes main topics and goals discussed -- Lists all accomplishments and completed tasks -- Identifies all files that were read, modified, or created -- Extracts important technical decisions and their rationale -- Captures any error messages encountered and how they were resolved -- Notes any commands that were run (make, linter, tests, etc.) - -### Phase 2: File Modification Tracking -- Reads and verifies current state of modified files -- Lists specific line numbers and code changes -- Includes before/after comparisons for critical changes -- Notes which files were created vs modified vs deleted -- Tracks any generated files (like bindata) - -### Phase 3: Session File Creation -Creates a comprehensive markdown document with these sections: - -1. **Session Summary** - Brief 1-2 paragraph overview -2. **Current State** - Status of work and modifications -3. **Accomplishments** - Detailed completion checklist -4. **Files Modified** - Organized by Created/Modified/Deleted -5. **Key Technical Decisions** - Rationale and implications -6. **Pending Tasks** - Unfinished work (checkbox format) -7. **Commands Used** - All executed commands -8. **Context for Resumption** - Critical continuation information -9. **Full Conversation Summary** - Key discussion points -10. **Next Steps** - Clear action items -11. 
**How to Resume This Session** - Step-by-step guide - -### Phase 4: Verification and Output -- Confirms file was created successfully -- Displays file path and size -- Provides brief summary of what was saved -- Shows resumption instructions in terminal and saved file - -## Return Value - -Creates a markdown file in the repository root directory with filename: -- `session-YYYY-MM-DD-HHMM.md` (without description) -- `session-YYYY-MM-DD-.md` (with custom description) - -Terminal output: -``` -✅ Session saved successfully! - -File: session-YYYY-MM-DD-description.md (XX KB) -Location: /full/path/to/file - -📖 To resume this session: - Please read `/full/path/to/session-YYYY-MM-DD-description.md` and continue from where we left off -``` - -## Examples - -**Basic usage with auto-generated timestamp:** -``` -/save-session -``` -Creates: `session-2025-10-16-1430.md` - -**With custom description for easy identification:** -``` -/save-session parallel-test-fixes -``` -Creates: `session-2025-10-16-parallel-test-fixes.md` - -**Multiple sessions in one project:** -``` -/save-session initial-implementation -/save-session pr-review-feedback -/save-session final-testing -``` - -**With spaces and special characters (automatically sanitized):** -``` -/save-session investigating OCPBUGS-12345 regarding routes -``` -Creates: `session-2025-10-16-investigating-ocpbugs-12345-regarding-routes.md` - -**Resuming a saved session:** -Open Claude Code and say: -``` -Please read `/path/to/session-2025-10-16-parallel-test-fixes.md` and continue from where we left off -``` - -## Arguments - -**description** (optional) -- Custom identifier appended to the filename -- Helps identify the session purpose when resuming after long intervals -- **Input handling**: Description is automatically sanitized for safe filename usage (spaces converted to hyphens, special characters removed, truncated to 100 chars if needed) -- **Good examples**: `feature-name`, `bug-fix`, `refactoring`, 
`investigating-ocpbugs-12345` -- Automatically added to filename: `session-YYYY-MM-DD-.md` - -If no description is provided, timestamp alone is used: `session-YYYY-MM-DD-HHMM.md` - -**Note**: You can use spaces and special characters in your description - they will be automatically sanitized. For example, "investigating OCPBUGS-12345 regarding routes" becomes "investigating-ocpbugs-12345-regarding-routes". diff --git a/plugins/utils/commands/auto-approve-konflux-prs.md b/plugins/utils/commands/auto-approve-konflux-prs.md deleted file mode 100644 index 51c44f530..000000000 --- a/plugins/utils/commands/auto-approve-konflux-prs.md +++ /dev/null @@ -1,143 +0,0 @@ ---- -description: Automate approving Konflux bot PRs for the given repository by adding /lgtm and /approve -argument-hint: ---- - -## Name - -utils:auto-approve-konflux-prs - -## Synopsis - -/utils:auto-approve-konflux-prs - -## Description - -The command automates the approval of open PRs created by the `red-hat-konflux[bot]` for the given repository. - -It filters all open PRs from the given repository, checks whether the PR already has `/lgtm` and `/approve` comments, verifies that all required checks (CI jobs or other mandatory checks) have passed, and if any labels/comments are missing and all checks succeed, posts `/lgtm` and `/approve` comments to trigger approval. - -This ensures that PRs are only auto-approved if all required checks succeed and the author is `red-hat-konflux[bot]`, reducing the risk of approving failing or unauthorized changes. - - -## Arguments - -- **$1 – target-repositories** *(required)*: GitHub repository in `OWNER/REPO` format. - - Example: openshift/multiarch-tuning-operator. - -## Implementation - -The command executes the following workflow: - -### 1. Restrict Author - -The command only processes PRs authored by `red-hat-konflux[bot]`. 
If a PR from any other author is encountered, it reports an error such as below: -``` -⚠️ Only PRs from red-hat-konflux[bot] can be automatically processed -``` -and exits. - -### 2. Get Open PRs - -Fetch all open PRs authored by `red-hat-konflux[bot]` for the specified repository: - -```bash -gh pr list --repo --author app/red-hat-konflux --state open --json number,title,baseRefName,labels -``` -- Extract: number,title,baseRefName,labels - -### 3. Check CI Status and Labels - -#### **For Each PR:**: - -1. Fetch detailed PR information: -```bash -gh pr view --repo --json statusCheckRollup,labels -``` -- Extract: statusCheckRollup,labels -- Handle errors: If PR is inaccessible, log warning and skip - -2. Verify all required checks: -- Verify all required checks have "conclusion": "SUCCESS" -- If any check has failed or is pending(except one pending tide job), skip adding /lgtm or /approve and log: -``` -⚠️ Skipping PR #: CI checks not all passed -``` - -3. Inspect labels: - - Check for lgtm label - - Check for approved label - -4. Add missing labels via comments: - - If /lgtm is missing, post a comment /lgtm - - If /approve is missing, post a comment /approve - - If both are missing, post a single comment containing both commands. - -5. Log each action: -``` -✅ Added /lgtm and/or /approve to PR #: (merge into ) -``` - -## Return Value - -- **Claude agent text**: Summary of processed PRs and actions taken. -- **Side effects**: - - Comments posted to PRs to trigger /lgtm and /approve. - - Progress updates for multiple PRs. - -## Examples - -1. **Process all open PRs from `red-hat-konflux[bot]` in a repository**: - - ``` - /utils:auto-approve-konflux-prs openshift/multiarch-tuning-operator - ``` - - Output: - ``` - Processing 3 open Konflux PRs... 
- [1/3] PR #84 - chore(deps): update konflux references (merge into main) - ✅ Added /lgtm and /approve (all CI passed) - - [2/3] PR #83 - chore(deps): update konflux references (merge into v1.x) - ⚠️ Skipping: CI checks not all passed - - [3/3] PR #82 - chore(deps): update konflux references (merge into fbc) - ✅ Added /lgtm (already had /approve, all CI passed) - - Summary: - ✅ Processed 2 PRs successfully, 1 skipped due to CI failures - ``` - -## Prerequisites - -### Required Tools - -1. **GitHub CLI (`gh`)**: Must be installed and authenticated - - Install: `brew install gh` (macOS) or see [GitHub CLI docs](https://cli.github.com/) - - Authenticate: `gh auth login` - - Verify: `gh auth status` - -2. **Access to GitHub Repositories**: Must have read access to repos where PRs are located - - PRs in private repos require appropriate GitHub permissions - - Public repos should work without additional configuration - -### Required Permissions - -1. **GitHub Permissions**: - - Read access to pull requests - - Write access to create comments on pull requests - -## Error Handling - -- **Repository inaccessible**: Reports error and exits. -- **PRs authored by someone other than `red-hat-konflux[bot]`**: Reports error and exits. -- **No open PRs from Konflux bot**: Logs "No PRs to process". -- **GitHub authentication failure**: Suggests re-login with `gh auth login`. -- **Comment posting failure**: Logs PR number and error for manual review. - -## Notes - -- The command only processes open PRs authored by `app/red-hat-konflux`. -- Compatible with repositories in which the user has write permission to post PR comments. -- Designed to minimize manual PR review effort and maintain consistent approvals. 
diff --git a/plugins/utils/commands/placeholder.md b/plugins/utils/commands/placeholder.md deleted file mode 100644 index aa9980dcc..000000000 --- a/plugins/utils/commands/placeholder.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -description: Placeholder command for the utils plugin ---- - -## Name -utils:placeholder - -## Synopsis -``` -/utils:placeholder -``` - -## Description -This is a placeholder command for the utils plugin. The utils plugin serves as a catch-all location for introducing new generic commands. Once enough related commands are accumulated, they can be segregated into more targeted, specialized plugins. - -This placeholder exists to maintain the plugin structure and will be replaced with actual utility commands as they are developed. - -## Implementation -The utils plugin provides a home for: -- Generic helper commands that don't fit into existing specialized plugins -- Experimental commands that may later be moved to dedicated plugins -- Common utilities that benefit multiple workflows -- Commands that are waiting to be grouped with similar functionality - -## Arguments: -None diff --git a/plugins/yaml/.claude-plugin/plugin.json b/plugins/yaml/.claude-plugin/plugin.json deleted file mode 100644 index 8dcd1826c..000000000 --- a/plugins/yaml/.claude-plugin/plugin.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "name": "yaml", - "version": "0.0.2", - "description": "Generate comprehensive YAML documentation from Go struct definitions with sensible default values", - "author": { - "name": "saswatamcode" - }, - "license": "Apache-2.0" -} diff --git a/plugins/yaml/README.md b/plugins/yaml/README.md deleted file mode 100644 index 2b203cc54..000000000 --- a/plugins/yaml/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# YAML Plugin - -YAML documentation and utilities for Claude Code. - -## Commands - -### `/yaml:docs` - -Generate or query documentation for YAML files and structures. - -See [commands/docs.md](commands/docs.md) for full documentation. 
- -## Installation - -```bash -/plugin install yaml@ai-helpers -``` - diff --git a/plugins/yaml/commands/docs.md b/plugins/yaml/commands/docs.md deleted file mode 100644 index 3683dbe65..000000000 --- a/plugins/yaml/commands/docs.md +++ /dev/null @@ -1,168 +0,0 @@ ---- -description: Generate comprehensive YAML documentation from Go struct definitions with sensible default values -argument-hint: "[file:StructName] [output.md]" ---- - -## Name -yaml:docs - -## Synopsis -``` -/yaml:docs [file:StructName] [output.md] -``` - -## Description -The `yaml:docs` command generates comprehensive YAML documentation from Go struct definitions. It analyzes Go structs and produces complete, well-documented YAML configuration examples with intelligent default values for all fields. - -This command is designed to help developers quickly create YAML configuration documentation by: -- Automatically generating sensible default values for all struct fields -- Adding inline comments explaining each field's purpose and constraints -- Maintaining proper YAML formatting and structure -- Supporting nested structs, slices, maps, and complex types -- Respecting struct tags (yaml, json, validate, default) - -The spec sections is inspired by https://man7.org/linux/man-pages/man7/man-pages.7.html#top_of_page - -## Implementation - -You are a specialized tool for generating comprehensive YAML documentation from Go struct definitions. - -### Task - -Analyze the provided Go struct and generate complete YAML documentation with: -- All fields populated with intelligent, sensible default values (never leave fields empty) -- Inline comments explaining each field's purpose and constraints -- Proper YAML formatting and structure -- Nested YAML for embedded structs with all sub-fields populated - -### Input Handling - -The user may provide input in these formats: -1. `$1 $2` - File path with struct name (e.g., `pkg/api/types.go:MetricsConfig`) and optional output file path -2. 
`$1` - Just the file path with struct name -3. Selected code containing a Go struct definition (no arguments) - -### Instructions - -1. **Locate the struct:** - - If a file path is provided (format: `file.go:StructName`), read that file and find the specified struct - - If code is selected, use the selected Go struct definition - - Search for the struct definition and any embedded struct types - -2. **Analyze struct metadata:** - - Examine struct tags: `yaml`, `json`, `validate`, `default` - - Note validation constraints (min, max, required, etc.) - - Identify field types (strings, ints, bools, slices, maps, nested structs, pointers) - - Preserve field ordering from the struct definition - -3. **Generate intelligent defaults:** - - **Strings**: Use contextually appropriate values based on field names (e.g., "localhost" for host, "info" for log level) - - **Integers**: Use common sensible values (e.g., 8080 for port, 30 for timeout seconds) - - **Booleans**: Default to `false` unless the field name suggests otherwise - - **Durations**: Use human-readable format (e.g., "30s", "5m", "1h") - - **Slices**: Provide 1-2 example values in array format - - **Maps**: Provide 1-2 example key-value pairs - - **Nested structs**: Recursively populate all sub-fields - - **Pointers**: Treat as optional but still provide example values - -4. **Format the output:** - - Use proper YAML indentation (2 spaces) - - Add inline comments with `#` explaining each field - - Include validation constraints in comments where applicable - - Add section headers for major struct groups - - Ensure valid YAML syntax - -5. 
**Write the output:** - - If an output file path is provided as `$2`, use the Write tool to create that file with the generated YAML content (write pure YAML, not markdown) - - Otherwise, display the generated YAML to the user in a markdown code block with yaml syntax highlighting - -### Important Behaviors - -- **ALWAYS populate all fields** - never leave fields empty or use placeholder text -- Infer contextually appropriate defaults from field names and types -- Include helpful comments explaining what each field does -- Maintain the struct's field order in the YAML output -- Handle complex nested structures by recursively applying these rules - -## Return Value -- **Claude agent text**: Generated YAML documentation with intelligent defaults and inline comments -- **File output** (if $2 provided): YAML file written to the specified path - -## Examples - -### Example 1: Basic usage with file path and struct name -``` -/yaml:docs pkg/config/server.go:ServerConfig -``` - -Input struct: -```go -type ServerConfig struct { - Host string `yaml:"host" json:"host" validate:"required"` - Port int `yaml:"port" json:"port" validate:"min=1,max=65535"` - Timeout time.Duration `yaml:"timeout" json:"timeout"` - Debug bool `yaml:"debug" json:"debug"` - Features []string `yaml:"features" json:"features"` -} -``` - -Output: -```yaml -# Server configuration -host: "localhost" # Required: Server hostname or IP address -port: 8080 # Port number (1-65535) -timeout: "30s" # Request timeout duration -debug: false # Enable debug logging -features: ["metrics", "tracing"] # List of enabled features -``` - -### Example 2: Complex nested structs with output file -``` -/yaml:docs pkg/config/database.go:DatabaseConfig config/database.yaml -``` - -Input struct: -```go -type DatabaseConfig struct { - Host string `yaml:"host"` - Port int `yaml:"port"` - SSL SSLConfig `yaml:"ssl"` - Pools map[string]int `yaml:"pools"` - Metadata *MetadataConfig `yaml:"metadata,omitempty"` -} - -type SSLConfig 
struct { - Enabled bool `yaml:"enabled"` - CertFile string `yaml:"cert_file"` - KeyFile string `yaml:"key_file"` -} -``` - -Generated YAML (written to config/database.yaml): -```yaml -# Database configuration -host: "localhost" # Database host -port: 5432 # Database port -ssl: # SSL configuration - enabled: true # Enable SSL connection - cert_file: "/etc/ssl/certs/db.crt" # SSL certificate file path - key_file: "/etc/ssl/private/db.key" # SSL private key file path -pools: # Connection pools configuration - read: 10 # Read connection pool size - write: 5 # Write connection pool size -metadata: # Optional metadata configuration - cache_ttl: "1h" # Cache time-to-live - sync_interval: "5m" # Sync interval -``` - -### Example 3: Using with selected code -Select a Go struct definition in your editor, then run: -``` -/yaml:docs -``` - -The command will generate YAML documentation from the selected struct. - -## Arguments -- $1: File path and struct name in format `file.go:StructName` (e.g., `pkg/api/types.go:MetricsConfig`), or selected code containing a Go struct definition -- $2: (Optional) Output file path where the generated YAML will be written (e.g., `config/example.yaml`) From 03361d65add8b6f401e8297ebb719484c8dca159 Mon Sep 17 00:00:00 2001 From: Stephen Benjamin Date: Tue, 5 May 2026 14:50:35 -0400 Subject: [PATCH 3/7] chore: bump versions for plugins with removed commands Bump patch versions for git (0.0.6), olm (0.1.2), openshift (0.0.6), and utils (0.0.10) since commands were removed from each. 
Co-Authored-By: Claude Opus 4.6 --- .claude-plugin/marketplace.json | 8 ++++---- docs/data.json | 8 ++++---- plugins/git/.claude-plugin/plugin.json | 2 +- plugins/olm/.claude-plugin/plugin.json | 2 +- plugins/openshift/.claude-plugin/plugin.json | 2 +- plugins/utils/.claude-plugin/plugin.json | 2 +- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 5e4f1d68a..f10a05784 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -8,7 +8,7 @@ "name": "git", "source": "./plugins/git", "description": "Git Plugin", - "version": "0.0.5" + "version": "0.0.6" }, { "name": "hello-world", @@ -50,13 +50,13 @@ "name": "utils", "source": "./plugins/utils", "description": "A generic utilities plugin serving as a catch-all for various helper commands", - "version": "0.0.9" + "version": "0.0.10" }, { "name": "olm", "source": "./plugins/olm", "description": "OLM (Operator Lifecycle Manager) plugin for operator management and debugging", - "version": "0.1.1" + "version": "0.1.2" }, { "name": "olm-team", @@ -80,7 +80,7 @@ "name": "openshift", "source": "./plugins/openshift", "description": "OpenShift development utilities and helpers", - "version": "0.0.5" + "version": "0.0.6" }, { "name": "openshift-tls-profile", diff --git a/docs/data.json b/docs/data.json index 217901c87..f37541632 100644 --- a/docs/data.json +++ b/docs/data.json @@ -58,7 +58,7 @@ "name": "Suggest Reviewers Helper" } ], - "version": "0.0.5" + "version": "0.0.6" }, { "commands": [ @@ -847,7 +847,7 @@ "hooks": [], "name": "utils", "skills": [], - "version": "0.0.9" + "version": "0.0.10" }, { "commands": [ @@ -887,7 +887,7 @@ "hooks": [], "name": "olm", "skills": [], - "version": "0.1.1" + "version": "0.1.2" }, { "commands": [ @@ -1076,7 +1076,7 @@ "name": "OpenShift Node Kernel Inspection" } ], - "version": "0.0.5" + "version": "0.0.6" }, { "commands": [ diff --git a/plugins/git/.claude-plugin/plugin.json 
b/plugins/git/.claude-plugin/plugin.json index 7ce0e4ad9..3219fa6d0 100644 --- a/plugins/git/.claude-plugin/plugin.json +++ b/plugins/git/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "git", "description": "Git workflow automation and utilities", - "version": "0.0.5", + "version": "0.0.6", "author": { "name": "github.com/openshift-eng" } diff --git a/plugins/olm/.claude-plugin/plugin.json b/plugins/olm/.claude-plugin/plugin.json index b6958e1ed..fb2925423 100644 --- a/plugins/olm/.claude-plugin/plugin.json +++ b/plugins/olm/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "olm", "description": "OLM (Operator Lifecycle Manager) plugin for operator management and debugging", - "version": "0.1.1", + "version": "0.1.2", "author": { "name": "github.com/openshift-eng" } diff --git a/plugins/openshift/.claude-plugin/plugin.json b/plugins/openshift/.claude-plugin/plugin.json index 8252caba4..c093474c7 100644 --- a/plugins/openshift/.claude-plugin/plugin.json +++ b/plugins/openshift/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "openshift", "description": "OpenShift development utilities and helpers", - "version": "0.0.5", + "version": "0.0.6", "author": { "name": "github.com/openshift-eng" } diff --git a/plugins/utils/.claude-plugin/plugin.json b/plugins/utils/.claude-plugin/plugin.json index 424a4f989..4a311da77 100644 --- a/plugins/utils/.claude-plugin/plugin.json +++ b/plugins/utils/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "utils", "description": "A generic utilities plugin serving as a catch-all for various helper commands and agents", - "version": "0.0.9", + "version": "0.0.10", "author": { "name": "github.com/openshift-eng" } From aaf3023135198ff961b3d367f2c72ba79e56f3fc Mon Sep 17 00:00:00 2001 From: Stephen Benjamin Date: Tue, 5 May 2026 14:53:29 -0400 Subject: [PATCH 4/7] chore: bump bigquery version for OWNERS file addition Co-Authored-By: Claude Opus 4.6 --- .claude-plugin/marketplace.json | 2 +- docs/data.json | 2 +- 
plugins/bigquery/.claude-plugin/plugin.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index f10a05784..d85e949e8 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -140,7 +140,7 @@ "name": "bigquery", "source": "./plugins/bigquery", "description": "BigQuery analysis utilities", - "version": "0.0.2" + "version": "0.0.3" }, { "name": "workspaces", diff --git a/docs/data.json b/docs/data.json index f37541632..d93689f0d 100644 --- a/docs/data.json +++ b/docs/data.json @@ -1371,7 +1371,7 @@ "name": "Analyze BigQuery Usage" } ], - "version": "0.0.2" + "version": "0.0.3" }, { "commands": [ diff --git a/plugins/bigquery/.claude-plugin/plugin.json b/plugins/bigquery/.claude-plugin/plugin.json index 9f85bb859..927682916 100644 --- a/plugins/bigquery/.claude-plugin/plugin.json +++ b/plugins/bigquery/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "bigquery", "description": "BigQuery cost analysis and optimization utilities", - "version": "0.0.2", + "version": "0.0.3", "author": { "name": "github.com/openshift-eng" } From b44be27f18c6fbff589a39531cbc3ce929180802 Mon Sep 17 00:00:00 2001 From: Stephen Benjamin Date: Tue, 5 May 2026 15:04:11 -0400 Subject: [PATCH 5/7] chore: restore saved items from pruning PR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restored and added to .pruneprotect: - plugins/olm/ (all 6 commands) — saved by @stbenjam Co-Authored-By: Claude Opus 4.6 --- .claude-plugin/marketplace.json | 2 +- .pruneprotect | 3 + PLUGINS.md | 6 + docs/data.json | 38 ++- plugins/olm/.claude-plugin/plugin.json | 2 +- plugins/olm/commands/approve.md | 305 +++++++++++++++++ plugins/olm/commands/catalog.md | 433 +++++++++++++++++++++++++ plugins/olm/commands/install.md | 272 ++++++++++++++++ plugins/olm/commands/opm.md | 359 ++++++++++++++++++++ plugins/olm/commands/uninstall.md | 392 
++++++++++++++++++++++ plugins/olm/commands/upgrade.md | 349 ++++++++++++++++++++ 11 files changed, 2158 insertions(+), 3 deletions(-) create mode 100644 plugins/olm/commands/approve.md create mode 100644 plugins/olm/commands/catalog.md create mode 100644 plugins/olm/commands/install.md create mode 100644 plugins/olm/commands/opm.md create mode 100644 plugins/olm/commands/uninstall.md create mode 100644 plugins/olm/commands/upgrade.md diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index d85e949e8..2e0ea0ca7 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -56,7 +56,7 @@ "name": "olm", "source": "./plugins/olm", "description": "OLM (Operator Lifecycle Manager) plugin for operator management and debugging", - "version": "0.1.2" + "version": "0.1.1" }, { "name": "olm-team", diff --git a/.pruneprotect b/.pruneprotect index 88f029bed..f63330f29 100644 --- a/.pruneprotect +++ b/.pruneprotect @@ -16,3 +16,6 @@ plugins/bigquery/ plugins/ci/ plugins/hcp/ plugins/sosreport/ + +# Saved by @stbenjam on 2026-05-05 +plugins/olm/ diff --git a/PLUGINS.md b/PLUGINS.md index b5464b582..6e7dafd34 100644 --- a/PLUGINS.md +++ b/PLUGINS.md @@ -189,11 +189,17 @@ See [plugins/must-gather/README.md](plugins/must-gather/README.md) for detailed OLM (Operator Lifecycle Manager) plugin for operator management and debugging **Commands:** +- **`/olm:approve` ` [namespace] [--all]`** - Approve pending InstallPlans for operator installations and upgrades +- **`/olm:catalog` ` [arguments]`** - Manage catalog sources for discovering and installing operators - **`/olm:debug` ` [olm-version]`** - Debug OLM issues using must-gather logs and source code analysis - **`/olm:diagnose` `[operator-name] [namespace] [--fix] [--cluster]`** - Diagnose and optionally fix common OLM and operator issues +- **`/olm:install` ` [namespace] [channel] [source] [--approval=Automatic|Manual]`** - Install a day-2 operator using Operator Lifecycle Manager 
- **`/olm:list` `[namespace] [--all-namespaces]`** - List installed operators in the cluster +- **`/olm:opm` ` [arguments...]`** - Execute opm (Operator Package Manager) commands for building and managing operator catalogs - **`/olm:search` `[query] [--catalog ]`** - Search for available operators in catalog sources - **`/olm:status` ` [namespace]`** - Get detailed status and health information for an operator +- **`/olm:uninstall` ` [namespace] [--remove-crds] [--remove-namespace]`** - Uninstall a day-2 operator and optionally remove its resources +- **`/olm:upgrade` ` [namespace] [--channel=] [--approve]`** - Update an operator to the latest version or switch channels See [plugins/olm/README.md](plugins/olm/README.md) for detailed documentation. diff --git a/docs/data.json b/docs/data.json index d93689f0d..028db3915 100644 --- a/docs/data.json +++ b/docs/data.json @@ -851,6 +851,18 @@ }, { "commands": [ + { + "argument_hint": " [namespace] [--all]", + "description": "Approve pending InstallPlans for operator installations and upgrades", + "name": "approve", + "synopsis": "/olm:approve [namespace] [--all]" + }, + { + "argument_hint": " [arguments]", + "description": "Manage catalog sources for discovering and installing operators", + "name": "catalog", + "synopsis": "/olm:catalog list" + }, { "argument_hint": " [olm-version]", "description": "Debug OLM issues using must-gather logs and source code analysis", @@ -863,12 +875,24 @@ "name": "diagnose", "synopsis": "/olm:diagnose [operator-name] [namespace] [--fix] [--cluster]" }, + { + "argument_hint": " [namespace] [channel] [source] [--approval=Automatic|Manual]", + "description": "Install a day-2 operator using Operator Lifecycle Manager", + "name": "install", + "synopsis": "/olm:install [namespace] [channel] [source] [--approval=Automatic|Manual]" + }, { "argument_hint": "[namespace] [--all-namespaces]", "description": "List installed operators in the cluster", "name": "list", "synopsis": "/olm:list [namespace] 
[--all-namespaces]" }, + { + "argument_hint": " [arguments...]", + "description": "Execute opm (Operator Package Manager) commands for building and managing operator catalogs", + "name": "opm", + "synopsis": "/olm:opm build-index-image [--cacheless] [--arch=] [--base-image=] [--builder-image=]" + }, { "argument_hint": "[query] [--catalog ]", "description": "Search for available operators in catalog sources", @@ -880,6 +904,18 @@ "description": "Get detailed status and health information for an operator", "name": "status", "synopsis": "/olm:status [namespace]" + }, + { + "argument_hint": " [namespace] [--remove-crds] [--remove-namespace]", + "description": "Uninstall a day-2 operator and optionally remove its resources", + "name": "uninstall", + "synopsis": "/olm:uninstall [namespace] [--remove-crds] [--remove-namespace]" + }, + { + "argument_hint": " [namespace] [--channel=] [--approve]", + "description": "Update an operator to the latest version or switch channels", + "name": "upgrade", + "synopsis": "/olm:upgrade [namespace] [--channel=] [--approve]" } ], "description": "OLM (Operator Lifecycle Manager) plugin for operator management and debugging", @@ -887,7 +923,7 @@ "hooks": [], "name": "olm", "skills": [], - "version": "0.1.2" + "version": "0.1.1" }, { "commands": [ diff --git a/plugins/olm/.claude-plugin/plugin.json b/plugins/olm/.claude-plugin/plugin.json index fb2925423..b6958e1ed 100644 --- a/plugins/olm/.claude-plugin/plugin.json +++ b/plugins/olm/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "olm", "description": "OLM (Operator Lifecycle Manager) plugin for operator management and debugging", - "version": "0.1.2", + "version": "0.1.1", "author": { "name": "github.com/openshift-eng" } diff --git a/plugins/olm/commands/approve.md b/plugins/olm/commands/approve.md new file mode 100644 index 000000000..3aa5437b0 --- /dev/null +++ b/plugins/olm/commands/approve.md @@ -0,0 +1,305 @@ +--- +description: Approve pending InstallPlans for operator 
installations and upgrades +argument-hint: [namespace] [--all] +--- + +## Name +olm:approve + +## Synopsis +``` +/olm:approve [namespace] [--all] +``` + +## Description +The `olm:approve` command approves pending InstallPlans for operators with manual approval mode. This is required for operators that have `installPlanApproval: Manual` in their Subscription to proceed with installation or upgrades. + +This command helps you: +- Approve operator installations that are waiting for manual approval +- Approve operator upgrades +- Review what will be installed/upgraded before approval +- Batch approve multiple pending InstallPlans + +## Implementation + +The command performs the following steps: + +1. **Parse Arguments**: + - `$1`: Operator name (required) - Name of the operator + - `$2`: Namespace (optional) - Namespace where operator is installed + - If not provided, searches for the operator across all namespaces + - `$3`: Flag (optional): + - `--all`: Approve all pending InstallPlans in the namespace + +2. **Prerequisites Check**: + - Verify `oc` CLI is installed: `which oc` + - Verify cluster access: `oc whoami` + - Check if user has sufficient privileges + +3. **Locate Operator**: + - If namespace provided, verify operator exists: + ```bash + oc get subscription {operator-name} -n {namespace} --ignore-not-found + ``` + - If no namespace provided, search across all namespaces: + ```bash + oc get subscription --all-namespaces -o json | jq -r '.items[] | select(.spec.name=="{operator-name}") | .metadata.namespace' + ``` + - If not found, display error with suggestions + +4. **Check Subscription Approval Mode**: + - Get Subscription approval mode: + ```bash + oc get subscription {operator-name} -n {namespace} -o jsonpath='{.spec.installPlanApproval}' + ``` + - If mode is "Automatic", display informational message: + ``` + ℹ️ Operator '{operator-name}' has automatic approval enabled. + InstallPlans are approved automatically and don't require manual intervention. 
+ + Current Subscription approval mode: Automatic + + To switch to manual approval mode: + oc patch subscription {operator-name} -n {namespace} \ + --type merge --patch '{"spec":{"installPlanApproval":"Manual"}}' + ``` + - Exit if automatic (no approval needed) + +5. **Find Pending InstallPlans**: + - Get all InstallPlans for the operator: + ```bash + oc get installplan -n {namespace} -o json + ``` + - Filter for unapproved plans related to this operator: + ```bash + oc get installplan -n {namespace} -o json | \ + jq '.items[] | select(.spec.approved==false and any(.spec.clusterServiceVersionNames[]; contains("{operator-name}")))' + ``` + - If no pending InstallPlans found: + ``` + ✓ No pending InstallPlans found for operator '{operator-name}' + + The operator is up to date or already approved. + + To check operator status: /olm:status {operator-name} {namespace} + ``` + - Exit with success + +6. **Display InstallPlan Details**: + For each pending InstallPlan, display: + ``` + ⏸️ Pending InstallPlan Found + + InstallPlan: {installplan-name} + Namespace: {namespace} + Phase: {phase} + Approved: false + + ClusterServiceVersions to be installed/upgraded: + - {csv-name-1} ({version-1}) + - {csv-name-2} ({version-2}) + + Resources to be created/updated: + - CustomResourceDefinitions: {crd-count} + - ServiceAccounts: {sa-count} + - ClusterRoles: {role-count} + - Deployments: {deployment-count} + + [If upgrade:] + Current Version: {current-version} + Target Version: {target-version} + ``` + +7. **Request User Confirmation** (unless `--all` flag): + - Display confirmation prompt: + ``` + Do you want to approve this InstallPlan? (yes/no) + ``` + - If user says no, skip this InstallPlan + - If user says yes, proceed to approval + +8. 
**Approve InstallPlan**: + - Patch the InstallPlan to approve it: + ```bash + oc patch installplan {installplan-name} -n {namespace} \ + --type merge --patch '{"spec":{"approved":true}}' + ``` + - Verify approval: + ```bash + oc get installplan {installplan-name} -n {namespace} -o jsonpath='{.spec.approved}' + ``` + - Display confirmation: + ``` + ✓ InstallPlan approved: {installplan-name} + ``` + - Reference: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-approving-operator-upgrades_olm-updating-operators + +9. **Monitor InstallPlan Execution** (optional): + - Watch InstallPlan phase change to "Complete": + ```bash + oc get installplan {installplan-name} -n {namespace} -w --request-timeout=120s + ``` + - Display progress: + ``` + 🔄 InstallPlan executing... + ⏳ Installing resources... + ``` + +10. **Verify Installation/Upgrade**: + - Wait for CSV to reach "Succeeded" phase: + ```bash + oc get csv -n {namespace} -o json | \ + jq -r '.items[] | select(.status.phase=="Succeeded") | .metadata.name' + ``` + - Display result: + ``` + ✓ Operator installation/upgrade complete + + CSV: {csv-name} + Version: {version} + Phase: Succeeded + + To check operator status: /olm:status {operator-name} {namespace} + ``` + +11. **Handle Multiple InstallPlans** (if `--all` flag): + - Process all pending InstallPlans for the operator + - Display summary: + ``` + ✓ Approved {count} InstallPlan(s) + + Approved: + - {installplan-1} + - {installplan-2} + + Monitoring installation progress... + ``` + +12. **Display Approval Summary**: + ``` + ✓ Approval Complete! + + Operator: {operator-name} + Namespace: {namespace} + Approved InstallPlans: {count} + + InstallPlan Status: + - {installplan-1}: Complete + - {installplan-2}: Installing... 
+ + Monitor progress: watch oc get csv,installplan -n {namespace} + ``` + +## Return Value +- **Success**: InstallPlan(s) approved successfully +- **No Pending Plans**: No InstallPlans require approval +- **Automatic Mode**: Operator has automatic approval (no action needed) +- **Error**: Approval failed with specific error message +- **Format**: Structured output showing: + - Approved InstallPlan names + - Installation/upgrade status + - Next steps or related commands + +## Examples + +1. **Approve pending InstallPlan for an operator**: + ``` + /olm:approve openshift-cert-manager-operator + ``` + +2. **Approve with specific namespace**: + ``` + /olm:approve external-secrets-operator eso-operator + ``` + +3. **Approve all pending InstallPlans**: + ``` + /olm:approve openshift-cert-manager-operator cert-manager-operator --all + ``` + This approves all pending InstallPlans for the operator in the namespace. + +4. **Check and approve after upgrade command**: + ``` + /olm:upgrade openshift-cert-manager-operator --channel=tech-preview + # Wait for InstallPlan to be created + /olm:approve openshift-cert-manager-operator + ``` + +## Arguments +- **$1** (operator-name): Name of the operator (required) + - Example: "openshift-cert-manager-operator" + - Must match the operator's Subscription name +- **$2** (namespace): Namespace where operator is installed (optional) + - If not provided, searches all namespaces + - Example: "cert-manager-operator" +- **$3** (flag): Optional flag + - `--all`: Approve all pending InstallPlans for this operator + - Useful when multiple upgrades are pending + - Skips individual confirmation prompts + +## Notes + +- **Manual Approval Mode**: This command only works for operators with `installPlanApproval: Manual` in their Subscription +- **Automatic Operators**: Operators with automatic approval don't need this command +- **Review Before Approval**: Always review what will be installed/upgraded before approving +- **Multiple InstallPlans**: An 
operator may have multiple pending InstallPlans if updates accumulated while waiting for approval +- **InstallPlan Retention**: Approved InstallPlans remain in the namespace for audit purposes + +## Troubleshooting + +- **No pending InstallPlans**: + ```bash + # List all InstallPlans + oc get installplan -n {namespace} + + # Check if operator is in automatic mode + oc get subscription {operator-name} -n {namespace} -o jsonpath='{.spec.installPlanApproval}' + ``` + +- **InstallPlan not executing after approval**: + ```bash + # Check InstallPlan status + oc describe installplan {installplan-name} -n {namespace} + + # Check for errors + oc get events -n {namespace} --sort-by='.lastTimestamp' | grep InstallPlan + ``` + +- **CSV not reaching Succeeded phase**: + ```bash + # Check CSV status + oc describe csv -n {namespace} + + # Check operator deployment + oc get deployments -n {namespace} + + # Check operator logs + oc logs -n {namespace} deployment/{operator-deployment} + ``` + +- **Permission denied**: + ```bash + # Check if you can patch InstallPlans + oc auth can-i patch installplan -n {namespace} + ``` + +- **Multiple namespaces found**: + - Specify the namespace explicitly in the command: + ``` + /olm:approve {operator-name} {specific-namespace} + ``` + +## Related Commands + +- `/olm:status ` - Check if InstallPlans are pending approval +- `/olm:upgrade ` - Trigger upgrade and approve in one command +- `/olm:install ` - Install operator with approval mode +- `/olm:list` - List operators and their approval modes + +## Additional Resources + +- [Red Hat OpenShift: Approving Operator Upgrades](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-approving-operator-upgrades_olm-updating-operators) +- [Red Hat OpenShift: Updating Installed Operators](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-updating-operators) +- [Operator Lifecycle 
Manager Documentation](https://olm.operatorframework.io/) + + diff --git a/plugins/olm/commands/catalog.md b/plugins/olm/commands/catalog.md new file mode 100644 index 000000000..cd43964ea --- /dev/null +++ b/plugins/olm/commands/catalog.md @@ -0,0 +1,433 @@ +--- +description: Manage catalog sources for discovering and installing operators +argument-hint: <subcommand> [arguments] +--- + +## Name +olm:catalog + +## Synopsis +``` +/olm:catalog list +/olm:catalog add <name> <image> [--namespace=openshift-marketplace] +/olm:catalog remove <name> [--namespace=openshift-marketplace] +/olm:catalog refresh <name> [--namespace=openshift-marketplace] +/olm:catalog status <name> [--namespace=openshift-marketplace] +``` + +## Description +The `olm:catalog` command manages catalog sources for operator discovery and installation. Catalog sources provide the list of operators available for installation in the cluster. + +This command helps you: +- List all available catalog sources and their health status +- Add custom or private catalog sources +- Remove catalog sources +- Refresh catalog sources to get latest operator updates + +## Implementation + +### Subcommand: list + +1. **Get All CatalogSources**: + ```bash + oc get catalogsource -n openshift-marketplace -o json + ``` + +2. **Parse CatalogSource Data**: + For each catalog, extract: + - Name: `.metadata.name` + - Display Name: `.spec.displayName` + - Publisher: `.spec.publisher` + - Source Type: `.spec.sourceType` (grpc, configmap, etc.) + - Image: `.spec.image` (for grpc type) + - Connection State: `.status.connectionState.lastObservedState` + - Last Updated: `.status.connectionState.lastUpdatedTime` + - Number of Operators: Count from PackageManifests with this catalog + +3. **Get Catalog Pod Status**: + ```bash + oc get pods -n openshift-marketplace -l olm.catalogSource={catalog-name} + ``` + +4. 
**Format Output**: + ``` + ═══════════════════════════════════════════════════════════ + CATALOG SOURCES + ═══════════════════════════════════════════════════════════ + + NAME STATUS OPERATORS LAST UPDATED SOURCE TYPE + redhat-operators READY 150 2h ago grpc + certified-operators READY 45 3h ago grpc + community-operators READY 200 1h ago grpc + redhat-marketplace READY 30 4h ago grpc + custom-catalog FAILED 0 - grpc + + ═══════════════════════════════════════════════════════════ + DETAILS + ═══════════════════════════════════════════════════════════ + + redhat-operators: + Display Name: Red Hat Operators + Publisher: Red Hat + Image: registry.redhat.io/redhat/redhat-operator-index:v4.20 + Pod: redhat-operators-abc123 (Running) + + custom-catalog (FAILED): + Display Name: Custom Catalog + Publisher: My Company + Image: registry.example.com/custom-catalog:latest + Pod: custom-catalog-xyz789 (CrashLoopBackOff) + Error: ImagePullBackOff + + To troubleshoot: + /olm:catalog status custom-catalog + ``` + +### Subcommand: add + +1. **Parse Arguments**: + - `name`: Catalog source name (required) + - `image`: Catalog image (required) + - `--namespace`: Target namespace (default: openshift-marketplace) + - `--display-name`: Display name (optional) + - `--publisher`: Publisher name (optional) + +2. **Validate Image**: + - Check if image format is valid + - Optionally test image accessibility (if possible) + +3. **Create CatalogSource Manifest**: + ```yaml + apiVersion: operators.coreos.com/v1alpha1 + kind: CatalogSource + metadata: + name: {name} + namespace: {namespace} + spec: + sourceType: grpc + image: {image} + displayName: {display-name} + publisher: {publisher} + updateStrategy: + registryPoll: + interval: 30m + ``` + +4. **Apply CatalogSource**: + ```bash + oc apply -f /tmp/catalogsource-{name}.yaml + ``` + +5. **Wait for CatalogSource to be Ready**: + ```bash + oc wait --for=jsonpath='{.status.connectionState.lastObservedState}'=READY catalogsource/{name} -n {namespace} --timeout=300s + ``` + +6. 
**Verify Pod is Running**: + ```bash + oc get pods -n {namespace} -l olm.catalogSource={name} + ``` + +7. **Display Result**: + ``` + ✓ Catalog source added: {name} + + Name: {name} + Namespace: {namespace} + Image: {image} + Status: READY + Pod: {pod-name} (Running) + + To search operators: /olm:search --catalog {name} + ``` + +### Subcommand: remove + +1. **Parse Arguments**: + - `name`: Catalog source name (required) + - `--namespace`: Namespace (default: openshift-marketplace) + +2. **Check if CatalogSource Exists**: + ```bash + oc get catalogsource {name} -n {namespace} --ignore-not-found + ``` + +3. **Check for Operators Using This Catalog**: + ```bash + oc get subscription --all-namespaces -o json | \ + jq -r '.items[] | select(.spec.source=="{name}") | "\(.metadata.namespace)/\(.metadata.name)"' + ``` + +4. **Display Warning** (if operators found): + ``` + WARNING: The following operators are using this catalog: + - namespace-1/operator-1 + - namespace-2/operator-2 + + Removing this catalog will prevent these operators from receiving updates. + + Do you want to continue? (yes/no) + ``` + +5. **Delete CatalogSource**: + ```bash + oc delete catalogsource {name} -n {namespace} + ``` + +6. **Wait for Pod to be Deleted**: + ```bash + oc wait --for=delete pod -l olm.catalogSource={name} -n {namespace} --timeout=60s + ``` + +7. **Display Result**: + ``` + ✓ Catalog source removed: {name} + ``` + +### Subcommand: refresh + +1. **Parse Arguments**: + - `name`: Catalog source name (required) + - `--namespace`: Namespace (default: openshift-marketplace) + +2. **Get Current CatalogSource**: + ```bash + oc get catalogsource {name} -n {namespace} -o json + ``` + +3. **Trigger Refresh by Deleting Pod**: + ```bash + oc delete pod -n {namespace} -l olm.catalogSource={name} + ``` + - This forces OLM to recreate the pod and re-fetch catalog data + +4. 
**Wait for New Pod to be Ready**: + ```bash + oc wait --for=condition=Ready pod -l olm.catalogSource={name} -n {namespace} --timeout=300s + ``` + +5. **Verify Catalog is Updated**: + ```bash + oc get catalogsource {name} -n {namespace} -o json | \ + jq -r '.status.connectionState.lastUpdatedTime' + ``` + +6. **Display Result**: + ``` + ✓ Catalog source refreshed: {name} + + Last Updated: {timestamp} + Status: READY + Pod: {pod-name} (Running) + + New operators may now be available: /olm:search --catalog {name} + ``` + +### Subcommand: status + +1. **Parse Arguments**: + - `name`: Catalog source name (required) + - `--namespace`: Namespace (default: openshift-marketplace) + +2. **Get CatalogSource Details**: + ```bash + oc get catalogsource {name} -n {namespace} -o json + ``` + +3. **Get Pod Details**: + ```bash + oc get pods -n {namespace} -l olm.catalogSource={name} -o json + ``` + +4. **Get Recent Events**: + ```bash + oc get events -n {namespace} --field-selector involvedObject.name={name} --sort-by='.lastTimestamp' + ``` + +5. **Count Available Operators**: + ```bash + oc get packagemanifests -n openshift-marketplace -o json | \ + jq -r '.items[] | select(.status.catalogSource=="{name}") | .metadata.name' | wc -l + ``` + +6. **Verify Catalog Connectivity**: + - Check if catalog is serving content by verifying PackageManifest count > 0 + - If count is 0 but pod is Running, indicates connectivity or catalog index issues + - Review catalog pod logs for gRPC errors, image pull issues, or index corruption: + ```bash + oc logs -n {namespace} {catalog-pod-name} + ``` + +7. 
**Format Comprehensive Status Report**: + ``` + ═══════════════════════════════════════════════════════════ + CATALOG SOURCE STATUS: {name} + ═══════════════════════════════════════════════════════════ + + General Information: + Name: {name} + Namespace: {namespace} + Display Name: {display-name} + Publisher: {publisher} + Source Type: {source-type} + Image: {image} + + Connection Status: + State: {state} (READY | CONNECTING | CONNECTION_FAILED) + Last Updated: {timestamp} + Last Successful: {timestamp} + + Pod Status: + Name: {pod-name} + Status: {status} (Running | CrashLoopBackOff | ImagePullBackOff) + Ready: {ready-containers}/{total-containers} + Restarts: {restart-count} + Age: {age} + + Catalog Content: + Operators Available: {count} + + [If issues detected:] + ⚠️ Issues Detected: + - Pod in CrashLoopBackOff + - Last update: 24h ago (stale) + - Connection state: CONNECTION_FAILED + + Recent Events: + {timestamp} Warning: Failed to pull image + {timestamp} Warning: Back-off restarting failed container + + Troubleshooting Steps: + 1. Check pod logs: oc logs -n {namespace} {pod-name} + 2. Check image accessibility + 3. Refresh catalog: /olm:catalog refresh {name} + 4. Verify network connectivity (for disconnected environments) + + Related Commands: + - Refresh: /olm:catalog refresh {name} + - List operators: /olm:search --catalog {name} + ``` + +## Return Value +- **list**: Table of all catalog sources with status +- **add**: Confirmation of added catalog with details +- **remove**: Confirmation of removed catalog +- **refresh**: Confirmation of refresh with updated timestamp +- **status**: Comprehensive status report for specific catalog + +## Examples + +1. **List all catalog sources**: + ``` + /olm:catalog list + ``` + +2. **Add custom catalog**: + ``` + /olm:catalog add my-catalog registry.example.com/my-catalog:v1.0 + ``` + +3. 
**Add catalog with metadata**: + ``` + /olm:catalog add my-catalog registry.example.com/catalog:latest \ + --display-name="My Custom Catalog" \ + --publisher="My Company" + ``` + +4. **Remove catalog**: + ``` + /olm:catalog remove my-catalog + ``` + +5. **Refresh catalog to get latest operators**: + ``` + /olm:catalog refresh redhat-operators + ``` + +6. **Check catalog health**: + ``` + /olm:catalog status custom-catalog + ``` + +7. **Add catalog for disconnected environment**: + ``` + /olm:catalog add disconnected-operators \ + mirror-registry.local:5000/olm/redhat-operators:v4.20 \ + --namespace=openshift-marketplace + ``` + +## Arguments + +### list +No arguments required. + +### add +- **name** (required): Name for the catalog source +- **image** (required): Container image containing the catalog +- **--namespace**: Target namespace (default: openshift-marketplace) +- **--display-name**: Human-readable display name +- **--publisher**: Publisher/organization name + +### remove +- **name** (required): Name of the catalog source to remove +- **--namespace**: Namespace (default: openshift-marketplace) + +### refresh +- **name** (required): Name of the catalog source to refresh +- **--namespace**: Namespace (default: openshift-marketplace) + +### status +- **name** (required): Name of the catalog source to check +- **--namespace**: Namespace (default: openshift-marketplace) + +## Troubleshooting + +- **Catalog pod failing**: + ```bash + # Check pod logs + oc logs -n openshift-marketplace {catalog-pod-name} + + # Check image pull issues + oc describe pod -n openshift-marketplace {catalog-pod-name} + ``` + +- **No operators showing up**: + ```bash + # Verify catalog is ready + /olm:catalog status {catalog-name} + + # Check PackageManifests + oc get packagemanifests -n openshift-marketplace + ``` + +- **Image pull errors (disconnected environment)**: + - Verify image registry is accessible + - Check pull secrets are configured + - Ensure image has been mirrored 
correctly + +- **Stale catalog data**: + ```bash + # Force refresh + /olm:catalog refresh {catalog-name} + ``` + +- **Connection failures**: + ```bash + # Check catalog source definition + oc get catalogsource {catalog-name} -n openshift-marketplace -o yaml + + # Run cluster diagnostics + /olm:diagnose --cluster + ``` + +## Related Commands + +- `/olm:search` - Search for operators in catalogs +- `/olm:install` - Install operators from catalogs +- `/olm:diagnose` - Diagnose catalog health issues + +## Additional Resources +- [Building Catalog Images with opm](https://olm.operatorframework.io/docs/tasks/creating-catalog-from-index/) +- [Operator Lifecycle Manager Documentation](https://olm.operatorframework.io/) + + diff --git a/plugins/olm/commands/install.md b/plugins/olm/commands/install.md new file mode 100644 index 000000000..ccc0bcfc2 --- /dev/null +++ b/plugins/olm/commands/install.md @@ -0,0 +1,272 @@ +--- +description: Install a day-2 operator using Operator Lifecycle Manager +argument-hint: [namespace] [channel] [source] [--approval=Automatic|Manual] +--- + +## Name +olm:install + +## Synopsis +``` +/olm:install [namespace] [channel] [source] [--approval=Automatic|Manual] +``` + +## Description +The `olm:install` command installs a day-2 operator in an OpenShift cluster using Operator Lifecycle Manager (OLM). It automates the creation of the required namespace, OperatorGroup, and Subscription resources needed to install an operator. + +This command handles the complete operator installation workflow: +- Creates or verifies the target namespace exists +- Creates an OperatorGroup if needed +- Creates a Subscription to install the operator +- Verifies the installation by checking the operator's CSV (ClusterServiceVersion) status +- Provides detailed feedback on the installation progress + +The command is designed to work with operators from the OperatorHub catalog, including Red Hat certified operators, community operators, and custom catalog sources. 
+ +## Implementation + +The command performs the following steps: + +1. **Parse Arguments**: + - `$1`: Operator name (required) - The name of the operator to install (e.g., "openshift-cert-manager-operator") + - `$2`: Namespace (optional) - Target namespace for the operator. If not provided, defaults to the operator name with any "openshift-" prefix removed (e.g., "openshift-cert-manager-operator" defaults to "cert-manager-operator") + - `$3`: Channel (optional) - Subscription channel. If not provided, discovers the default channel from the operator's PackageManifest + - `$4`: Source (optional) - CatalogSource name. Defaults to "redhat-operators" for Red Hat operators + - `$5+`: Flags (optional): + - `--approval=Automatic|Manual`: InstallPlan approval mode (default: Automatic) + - Automatic: Operator upgrades are automatically installed + - Manual: Operator upgrades require manual approval via `/olm:approve` or `oc patch` + +2. **Prerequisites Check**: + - Verify `oc` CLI is installed: `which oc` + - Verify cluster access: `oc whoami` + - Check if user has cluster-admin or sufficient privileges + - If not installed or not authenticated, provide clear instructions + +3. **Discover Operator Metadata** (if channel or source not provided): + - Search for the operator in available catalogs: + ```bash + oc get packagemanifests -n openshift-marketplace | grep {operator-name} + ``` + - Get the PackageManifest details: + ```bash + oc get packagemanifest {operator-name} -n openshift-marketplace -o json + ``` + - Extract: + - Default channel: `.status.defaultChannel` + - CatalogSource: `.status.catalogSource` + - CatalogSourceNamespace: `.status.catalogSourceNamespace` + - If operator not found, provide error with list of available operators + +4. **Create Namespace**: + - Check if namespace exists: `oc get namespace {namespace} --ignore-not-found` + - If not exists, create it: + ```bash + oc create namespace {namespace} + ``` + - If exists, inform user and continue + +5. 
**Create OperatorGroup**: + - Check if OperatorGroup exists in the namespace: + ```bash + oc get operatorgroup -n {namespace} --ignore-not-found + ``` + - If no OperatorGroup exists, create one: + ```yaml + apiVersion: operators.coreos.com/v1 + kind: OperatorGroup + metadata: + name: {namespace}-operatorgroup + namespace: {namespace} + spec: + targetNamespaces: + - {namespace} + ``` + - Save to temporary file and apply: + ```bash + oc apply -f /tmp/operatorgroup-{operator-name}.yaml + ``` + - If OperatorGroup already exists, inform user and continue + +6. **Create Subscription**: + - Parse approval mode from flags (default: Automatic) + - Create Subscription manifest: + ```yaml + apiVersion: operators.coreos.com/v1alpha1 + kind: Subscription + metadata: + name: {operator-name} + namespace: {namespace} + spec: + channel: {channel} + name: {operator-name} + source: {source} + sourceNamespace: openshift-marketplace + installPlanApproval: {Automatic|Manual} + ``` + - Save to temporary file and apply: + ```bash + oc apply -f /tmp/subscription-{operator-name}.yaml + ``` + - Display the created subscription details + - If approval mode is Manual, display informational message: + ``` + ℹ️ InstallPlan approval set to Manual + You will need to manually approve InstallPlans for this operator. + Use: /olm:approve {operator-name} {namespace} + + Reference: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-approving-operator-upgrades_olm-updating-operators + ``` + +7. 
**Verify Installation**: + - Wait for InstallPlan to be created: + ```bash + oc get installplan -n {namespace} -l operators.coreos.com/operator={operator-name} + ``` + - If approval mode is Manual, check if InstallPlan needs approval: + ```bash + oc get installplan -n {namespace} -o json | jq '.items[] | select(.spec.approved==false)' + ``` + - If Manual and not approved, display message: + ``` + ⏸️ InstallPlan created but requires manual approval + + InstallPlan: {installplan-name} + To approve: /olm:approve {operator-name} {namespace} + Or manually: oc patch installplan {installplan-name} -n {namespace} \ + --type merge --patch '{"spec":{"approved":true}}' + + Waiting for approval... + ``` + - Wait for CSV to be created and reach "Succeeded" phase: + ```bash + oc get csv -n {namespace} -w + ``` + - Use a timeout of 5 minutes for the installation to complete (10 minutes if Manual approval) + - Poll every 10 seconds to check CSV status + - Display progress updates to the user + +8. **Display Results**: + - Show the installed operator's CSV name and version + - Show the operator deployment status: + ```bash + oc get deployments -n {namespace} + ``` + - List any pods created by the operator: + ```bash + oc get pods -n {namespace} + ``` + - Display success message with next steps or usage instructions + +9. **Cleanup Temporary Files**: + - Remove temporary YAML files created during installation: + ```bash + rm -f /tmp/operatorgroup-{operator-name}.yaml /tmp/subscription-{operator-name}.yaml + ``` + +## Return Value +- **Success**: Operator installed successfully with details about the CSV, deployments, and pods +- **Error**: Installation failed with specific error message and troubleshooting suggestions +- **Format**: Structured output showing: + - Namespace created/used + - OperatorGroup status + - Subscription created + - CSV status and version + - Deployment and pod status + +## Examples + +1. 
**Install cert-manager-operator with defaults**: + ``` + /olm:install openshift-cert-manager-operator + ``` + This will: + - Create namespace `cert-manager-operator` + - Discover default channel from PackageManifest + - Use `redhat-operators` catalog source + - Install the operator + +2. **Install cert-manager-operator with custom namespace**: + ``` + /olm:install openshift-cert-manager-operator my-cert-manager + ``` + This will install the operator in the `my-cert-manager` namespace. + +3. **Install with specific channel**: + ``` + /olm:install openshift-cert-manager-operator cert-manager-operator stable-v1 + ``` + This will install from the `stable-v1` channel. + +4. **Install from community catalog**: + ``` + /olm:install prometheus community-operators stable community-operators + ``` + This will install Prometheus from the community-operators catalog. + +5. **Install Red Hat Advanced Cluster Security**: + ``` + /olm:install rhacs-operator rhacs-operator stable + ``` + +6. **Install with manual approval mode**: + ``` + /olm:install openshift-cert-manager-operator cert-manager-operator stable-v1 redhat-operators --approval=Manual + ``` + This will install the operator but require manual approval for all upgrades. + +7. 
**Install with all parameters specified**: + ``` + /olm:install external-secrets-operator eso-operator stable-v0.10 redhat-operators --approval=Automatic + ``` + +## Arguments +- **$1** (operator-name): The name of the operator to install (required) + - Example: "openshift-cert-manager-operator" + - Must match the name in the operator's PackageManifest +- **$2** (namespace): Target namespace for the operator installation (optional) + - Default: `{operator-name}` (operator name without "openshift-" prefix if present) + - Example: "cert-manager-operator" +- **$3** (channel): Subscription channel (optional) + - Default: Auto-discovered from PackageManifest's default channel + - Example: "stable-v1", "tech-preview", "stable" +- **$4** (source): CatalogSource name (optional) + - Default: "redhat-operators" + - Other options: "certified-operators", "community-operators", "redhat-marketplace" +- **$5+** (flags): Optional flags + - `--approval=Automatic|Manual`: InstallPlan approval mode + - **Automatic** (default): Operator upgrades are automatically installed without user intervention + - **Manual**: Operator upgrades require explicit approval. Useful for: + - Production environments requiring change control + - Testing upgrades before applying + - Preventing unexpected operator updates + - Reference: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-approving-operator-upgrades_olm-updating-operators + +## Notes + +- **Automatic Channel Discovery**: If no channel is specified, the command automatically discovers and uses the operator's default channel from its PackageManifest +- **Namespace Convention**: By default, operators are installed in a namespace named after the operator, with any "openshift-" prefix removed (e.g., `openshift-cert-manager-operator` is installed in `cert-manager-operator`) +- **OperatorGroup Scope**: The created OperatorGroup targets only the installation namespace for better isolation +- **InstallPlan Approval**: Set to "Automatic" by default for seamless installation. 
Can be changed to "Manual" using `--approval=Manual` flag +- **Manual Approval Mode**: When using `--approval=Manual`: + - Initial installation may require manual approval of the InstallPlan + - All future upgrades will require explicit approval via `/olm:approve` command + - Provides better control over operator updates in production environments +- **Verification Timeout**: The command waits up to 5 minutes for the operator to install successfully (10 minutes for manual approval mode) +- **Cleanup**: Temporary YAML files are automatically removed after installation + +## Troubleshooting + +- **Operator not found**: Run `oc get packagemanifests -n openshift-marketplace` to see available operators +- **Permission denied**: Ensure you have cluster-admin privileges or the necessary RBAC permissions +- **Installation timeout**: Check the InstallPlan and CSV status manually: + ```bash + oc get installplan -n {namespace} + oc get csv -n {namespace} + oc describe csv -n {namespace} + ``` +- **Operator pod not starting**: Check pod logs: + ```bash + oc logs -n {namespace} deployment/{operator-deployment} + ``` + diff --git a/plugins/olm/commands/opm.md b/plugins/olm/commands/opm.md new file mode 100644 index 000000000..bd30fe0de --- /dev/null +++ b/plugins/olm/commands/opm.md @@ -0,0 +1,359 @@ +--- +description: Execute opm (Operator Package Manager) commands for building and managing operator catalogs +argument-hint: [arguments...] 
+--- + +## Name +olm:opm + +## Synopsis +```bash +/olm:opm build-index-image <catalog-path> <index-image-tag> [--cacheless] [--arch=<arch>] [--base-image=<image>] [--builder-image=<image>] +/olm:opm build-semver-index-image <semver-template-file> <index-image-tag> [--cacheless] [--arch=<arch>] [--base-image=<image>] [--builder-image=<image>] +/olm:opm generate-semver-template <bundle-list> [--output=<file>] [--major=true|false] [--minor=true|false] +/olm:opm list packages <index-ref> +/olm:opm list channels <index-ref> [package-name] +/olm:opm list bundles <index-ref> [package-name] +``` + +## Description +The `olm:opm` command provides a unified interface to `opm` (Operator Package Manager) operations for building and managing operator catalog indexes. It supports building catalog indexes, generating semver templates, and querying catalog contents. + +## Arguments +- `$1`: **action** - The action to perform: + - `build-index-image`: Build an index from an existing catalog directory + - `build-semver-index-image`: Build an index from a semver template + - `generate-semver-template`: Generate a semver template file + - `list`: List catalog contents (requires second argument: `packages`, `channels`, or `bundles`) +- `$2+`: Additional arguments specific to each action (see Actions section below) + +## Actions + +### build-index-image +Build an operator catalog index image from an existing catalog directory. 
+ +**Synopsis:** +```bash +/olm:opm build-index-image [--cacheless] [--arch=] [--base-image=] [--builder-image=] +``` + +**Arguments:** +- `$2`: **catalog-path** - Path to the catalog directory containing the index configuration +- `$3`: **index-image-tag** - Full image tag for the resulting index image (e.g., `quay.io/myorg/mycatalog:v1.0.0`) +- `--cacheless`: Optional flag to build a cacheless image (uses `scratch` as base image; `--base-image` and `--builder-image` are ignored when this is set) +- `--arch=`: Optional architecture specification (default: `multi` for multi-arch build; can specify single arch like `amd64`, `arm64`, `ppc64le`, `s390x`) +- `--base-image=`: Optional base image for the index (default: `quay.io/operator-framework/opm:latest`; ignored if `--cacheless` is set) +- `--builder-image=`: Optional builder image (default: `quay.io/operator-framework/opm:latest`; ignored if `--cacheless` is set) + +**Examples:** +```bash +/olm:opm build-index-image catalog quay.io/myorg/mycatalog:v1.0.0 +/olm:opm build-index-image catalog quay.io/myorg/mycatalog:v1.0.0 --cacheless +/olm:opm build-index-image catalog quay.io/myorg/mycatalog:v1.0.0 --arch=amd64 +``` + +### build-semver-index-image +Build a multi-architecture operator catalog index image using the semver template format. 
+ +**Synopsis:** +```bash +/olm:opm build-semver-index-image [--cacheless] [--arch=] [--base-image=] [--builder-image=] +``` + +**Arguments:** +- `$2`: **semver-template-file** - Path to the semver template configuration file (e.g., `catalog-config.yaml`) +- `$3`: **index-image-tag** - Full image tag for the resulting index image (e.g., `quay.io/myorg/mycatalog:v1.0.0`) +- `--cacheless`: Optional flag to build a cacheless image (uses `scratch` as base image; `--base-image` and `--builder-image` are ignored when this is set) +- `--arch=`: Optional architecture specification (default: `multi` for multi-arch build; can specify single arch like `amd64`, `arm64`, `ppc64le`, `s390x`) +- `--base-image=`: Optional base image for the index (default: `quay.io/operator-framework/opm:latest`; ignored if `--cacheless` is set) +- `--builder-image=`: Optional builder image (default: `quay.io/operator-framework/opm:latest`; ignored if `--cacheless` is set) + +**Examples:** +```bash +/olm:opm build-semver-index-image catalog-config.yaml quay.io/myorg/mycatalog:v1.0.0 +/olm:opm build-semver-index-image catalog-config.yaml quay.io/myorg/mycatalog:v1.0.0 --cacheless +/olm:opm build-semver-index-image catalog-config.yaml quay.io/myorg/mycatalog:v1.0.0 --arch=amd64 +/olm:opm build-semver-index-image catalog-config.yaml quay.io/myorg/mycatalog:v1.0.0 --arch=multi +``` + +### generate-semver-template +Generate a semver template configuration file for building operator catalogs. 
+ +**Synopsis:** +```bash +/olm:opm generate-semver-template <bundle-list> [--output=<file>] [--major=true|false] [--minor=true|false] +``` + +**Arguments:** +- `$2`: **bundle-list** - Comma-separated list of bundle image references (e.g., `quay.io/org/bundle:v1.0.0,quay.io/org/bundle:v1.0.1`) +- `--output=<file>`: Optional output file path (default: `catalog-semver-config.yaml` in current directory) +- `--major=true|false`: Optional flag to generate major version channels (default: `true`) +- `--minor=true|false`: Optional flag to generate minor version channels (default: `false`) + +**Examples:** +```bash +/olm:opm generate-semver-template quay.io/org/bundle:v1.0.0,quay.io/org/bundle:v1.0.1 +/olm:opm generate-semver-template quay.io/org/bundle:v1.0.0,quay.io/org/bundle:v1.0.1 --output=my-catalog.yaml +/olm:opm generate-semver-template quay.io/org/bundle:v1.0.0,quay.io/org/bundle:v1.1.0 --minor=true +``` + +### list packages +List all operator packages available in a catalog index. + +**Synopsis:** +```bash +/olm:opm list packages <index-ref> +``` + +**Arguments:** +- `$1`: **list** - Must be "list" +- `$2`: **packages** - Must be "packages" +- `$3`: **index-ref** - Catalog index reference, either: + - Image tag: `quay.io/myorg/mycatalog:v1.0.0` + - Directory path: `./catalog` or `/path/to/catalog` + +**Examples:** +```bash +/olm:opm list packages quay.io/olmqe/nginx8518-index-test:v1 +/olm:opm list packages ./catalog +``` + +### list channels +List channels for operator packages in a catalog index. 
+ +**Synopsis:** +```bash +/olm:opm list channels <index-ref> [package-name] +``` + +**Arguments:** +- `$1`: **list** - Must be "list" +- `$2`: **channels** - Must be "channels" +- `$3`: **index-ref** - Catalog index reference (image tag or directory path) +- `$4`: **package-name** (Optional) - Name of a specific package to list channels for + +**Examples:** +```bash +/olm:opm list channels quay.io/olmqe/nginx8518-index-test:v1 +/olm:opm list channels quay.io/olmqe/nginx8518-index-test:v1 nginx85187 +/olm:opm list channels ./catalog +``` + +### list bundles +List bundles for operator packages in a catalog index. + +**Synopsis:** +```bash +/olm:opm list bundles <index-ref> [package-name] +``` + +**Arguments:** +- `$1`: **list** - Must be "list" +- `$2`: **bundles** - Must be "bundles" +- `$3`: **index-ref** - Catalog index reference (image tag or directory path) +- `$4`: **package-name** (Optional) - Name of a specific package to list bundles for + +**Examples:** +```bash +/olm:opm list bundles quay.io/olmqe/nginx8518-index-test:v1 +/olm:opm list bundles quay.io/olmqe/nginx8518-index-test:v1 nginx85187 +/olm:opm list bundles ./catalog +``` + +## Implementation + +### Step 1: Parse Action +- Extract the action from `$1` +- Validate the action is one of: `build-index-image`, `build-semver-index-image`, `generate-semver-template`, `list` +- If invalid action, display error with available actions + +### Step 2: Check Prerequisites +Verify required tools are installed: +- Check for `opm`: `which opm` + - If not found, provide installation instructions (opm binaries are published at https://github.com/operator-framework/operator-registry/releases) +- For build actions, also check for `podman`: `which podman` + - If not found, provide installation instructions based on user's platform + +### Step 3: Route to Action Handler +Based on the action, call the appropriate implementation: + +#### For `build-index-image`: +1. 
**Parse Arguments and Set Defaults** + - Extract catalog path from `$2` + - Extract index image tag from `$3` + - Parse optional flags: `--cacheless`, `--arch`, `--base-image`, `--builder-image` + - Set defaults: arch=`multi`, base-image=`quay.io/operator-framework/opm:latest`, builder-image=`quay.io/operator-framework/opm:latest` + +2. **Verify Catalog Directory** + - Check catalog directory exists: `test -d ` + +3. **Validate Catalog** + ```bash + opm validate + ``` + +4. **Generate Dockerfile** + - If cacheless: `opm generate dockerfile --base-image=scratch` + - If normal: `opm generate dockerfile -b -i ` + +5. **Determine Build Platform** + - If arch=`multi`: `linux/amd64,linux/arm64,linux/ppc64le,linux/s390x` + - Otherwise: `linux/` + +6. **Create Podman Manifest** + ```bash + podman manifest rm 2>/dev/null || true + podman manifest create + ``` + +7. **Build Image** + ```bash + podman build --platform --manifest . -f catalog.Dockerfile + ``` + +8. **Push Manifest** + ```bash + podman manifest push + ``` + +9. **List Bundles in Index** + ```bash + opm alpha list bundles + ``` + +10. **Display Success Message** + +#### For `build-semver-index-image`: +1. **Parse Arguments and Set Defaults** + - Extract semver template file from `$2` + - Extract index image tag from `$3` + - Parse optional flags: `--cacheless`, `--arch`, `--base-image`, `--builder-image` + - Set defaults: arch=`multi`, base-image=`quay.io/operator-framework/opm:latest`, builder-image=`quay.io/operator-framework/opm:latest` + +2. **Verify Template File** + - Check file exists: `test -f ` + +3. **Create Catalog and Render Template** + ```bash + mkdir -p catalog + opm alpha render-template semver -o yaml > catalog/index.yaml + ``` + +4. **Validate Catalog** + ```bash + opm validate catalog + ``` + +5. **Generate Dockerfile** + - If cacheless: `opm generate dockerfile catalog --base-image=scratch` + - If normal: `opm generate dockerfile catalog -b -i ` + +6. 
**Determine Build Platform** + - If arch=`multi`: `linux/amd64,linux/arm64,linux/ppc64le,linux/s390x` + - Otherwise: `linux/` + +7. **Create Podman Manifest** + ```bash + podman manifest rm 2>/dev/null || true + podman manifest create + ``` + +8. **Build Image** + ```bash + podman build --platform --manifest . -f catalog.Dockerfile + ``` + +9. **Push Manifest** + ```bash + podman manifest push + ``` + +10. **List Bundles in Index** + ```bash + opm alpha list bundles + ``` + +11. **Display Success Message** + +#### For `generate-semver-template`: +1. **Parse Arguments and Set Defaults** + - Extract bundle list from `$2` + - Parse optional flags: `--output`, `--major`, `--minor` + - Set defaults: output=`catalog-semver-config.yaml`, major=`true`, minor=`false` + +2. **Validate Bundle List** + - Split by commas + - Validate each bundle is a valid image reference + +3. **Generate YAML Content** + ```yaml + Schema: olm.semver + GenerateMajorChannels: + GenerateMinorChannels: + Candidate: + Bundles: + - Image: + - Image: + ``` + +4. **Write Template File** + - Check if file exists and confirm overwrite if needed + - Write YAML content + +5. **Validate Generated File** + - Read back and verify YAML is well-formed + +6. **Display Success Message** + - Show file path, bundles included, settings + - Suggest next step: `/olm:opm build-semver-index-image ` + +#### For `list`: +1. **Parse List Type** + - Extract list type from `$2` (must be `packages`, `channels`, or `bundles`) + - If invalid, display error with available types + +2. **Parse Index Reference and Optional Package** + - Extract index-ref from `$3` + - Extract optional package-name from `$4` (for channels and bundles) + +3. **Determine Reference Type** + - Check if directory: `test -d ` + +4. **Execute List Command** + - For packages: `opm alpha list packages ` + - For channels: `opm alpha list channels [package-name]` + - For bundles: `opm alpha list bundles [package-name]` + +5. 
**Display Results** + - Show the output with appropriate formatting + - Display count of items found + +## Return Value + +**Format**: Varies by action + +- **build-index-image / build-semver-index-image**: Success message with image tag, architectures, and bundle list +- **generate-semver-template**: Success message with file path and configuration details +- **list**: Table or list of catalog contents + +On failure, displays: +- Clear error message indicating which step/action failed +- Relevant tool output for debugging +- Suggestions for resolution + +## Notes + +- Ensure you are authenticated to container registries before building/pushing images (use `podman login`) +- For build operations, the `catalog.Dockerfile` is created in the current working directory +- Multi-architecture builds can be time-consuming +- Cacheless builds result in smaller images and use `scratch` as the base image +- When using `--cacheless`, the `--base-image` and `--builder-image` options are ignored (scratch is always used as base) +- Index references can be either image tags or local directory paths +- Bundle images must be accessible from where you build the catalog +- Image tags should include the full registry hostname (e.g., `quay.io/org/image:tag` not `quay/org/image:tag`) + +## Related Commands + +- `/olm:install` - Install an operator using OLM +- `/olm:catalog` - Manage catalog sources +- `/olm:debug` - Debug OLM issues diff --git a/plugins/olm/commands/uninstall.md b/plugins/olm/commands/uninstall.md new file mode 100644 index 000000000..36c3ec14c --- /dev/null +++ b/plugins/olm/commands/uninstall.md @@ -0,0 +1,392 @@ +--- +description: Uninstall a day-2 operator and optionally remove its resources +argument-hint: <operator-name> [namespace] [--remove-crds] [--remove-namespace] [--force] +--- + +## Name +olm:uninstall + +## Synopsis +``` +/olm:uninstall <operator-name> [namespace] [--remove-crds] [--remove-namespace] [--force] +``` + +## Description +The `olm:uninstall` command uninstalls a day-2 operator from an OpenShift 
cluster by removing its Subscription, ClusterServiceVersion (CSV), and optionally its Custom Resource Definitions (CRDs) and namespace. + +This command provides a comprehensive uninstallation workflow: +- Removes the operator's Subscription +- Deletes the ClusterServiceVersion (CSV) +- Optionally removes operator-managed deployments +- Optionally deletes Custom Resource Definitions (CRDs) +- Optionally removes the operator's namespace +- Provides detailed feedback on each step + +The command is designed to safely clean up operators installed via OLM, with optional flags for thorough cleanup of all operator-related resources. + +## Implementation + +The command performs the following steps: + +1. **Parse Arguments**: + - `$1`: Operator name (required) - The name of the operator to uninstall + - `$2`: Namespace (optional) - The namespace where the operator is installed. If not provided, defaults to the operator name with any "openshift-" prefix removed, matching the default used by `/olm:install` (e.g., "cert-manager-operator" for "openshift-cert-manager-operator") + - `$3+`: Flags (optional): + - `--remove-crds`: Remove Custom Resource Definitions after uninstalling + - `--remove-namespace`: Remove the operator's namespace after cleanup + - `--force`: Skip confirmation prompts + +2. **Prerequisites Check**: + - Verify `oc` CLI is installed: `which oc` + - Verify cluster access: `oc whoami` + - Check if user has cluster-admin or sufficient privileges + +3. **Verify Operator Installation**: + - Check if namespace exists: + ```bash + oc get namespace {namespace} --ignore-not-found + ``` + - Check if subscription exists: + ```bash + oc get subscription {operator-name} -n {namespace} --ignore-not-found + ``` + - If not found, display error: "Operator {operator-name} is not installed in namespace {namespace}" + - List what will be uninstalled + +4. 
**Display Uninstallation Plan**: + - Show operator details: + ```bash + oc get subscription {operator-name} -n {namespace} -o yaml + oc get csv -n {namespace} + ``` + - Display what will be removed: + - Subscription name and namespace + - CSV name and version + - Deployments (if any) + - CRDs (if `--remove-crds` flag is set) + - Namespace (if `--remove-namespace` flag is set) + +5. **Request User Confirmation** (unless `--force` flag is set): + - Display warning: + ``` + WARNING: You are about to uninstall {operator-name} from namespace {namespace}. + This will remove: + - Subscription: {subscription-name} + - ClusterServiceVersion: {csv-name} + - Operator deployments + [- Custom Resource Definitions (if --remove-crds is set)] + [- Namespace {namespace} (if --remove-namespace is set)] + + Are you sure you want to continue? (yes/no) + ``` + - Wait for user confirmation + - If user says no, abort operation + +6. **Delete Subscription**: + - Record the installed CSV name before deleting the subscription (it is needed in step 7): + ```bash + oc get subscription {operator-name} -n {namespace} -o jsonpath='{.status.installedCSV}' + ``` + - Remove the operator's subscription: + ```bash + oc delete subscription {operator-name} -n {namespace} + ``` + - Verify deletion: + ```bash + oc get subscription {operator-name} -n {namespace} --ignore-not-found + ``` + - Display result + +7. **Delete ClusterServiceVersion (CSV)**: + - Get the CSV name (use the name recorded in step 6; `oc` JSONPath filters do not support a `contains` operator, so if it was not recorded, select the CSV by its OLM component label): + ```bash + oc get csv -n {namespace} -l operators.coreos.com/{operator-name}.{namespace} -o jsonpath='{.items[*].metadata.name}' + ``` + - Delete the CSV: + ```bash + oc delete csv {csv-name} -n {namespace} + ``` + - This will automatically remove operator deployments + - Verify CSV is deleted: + ```bash + oc get csv -n {namespace} --ignore-not-found + ``` + +8. 
**Remove Operator Deployments** (if still present): + - List deployments created by the operator: + ```bash + oc get deployments -n {namespace} + ``` + - For operators like cert-manager with labeled resources: + ```bash + oc delete deployment -n {namespace} -l app.kubernetes.io/instance={operator-base-name} + ``` + - Verify deployments are deleted: + ```bash + oc get deployments -n {namespace} + ``` + +8.5. **Check for Orphaned Custom Resources** (before removing CRDs): + - Get list of CRDs managed by the operator from CSV (record this list before the CSV is deleted in step 7; the query returns nothing once the CSV is gone): + ```bash + oc get csv -n {namespace} -o jsonpath='{.items[0].spec.customresourcedefinitions.owned[*].name}' + ``` + - For each CRD, search for CR instances across all namespaces: + ```bash + oc get <crd-name> --all-namespaces --ignore-not-found + ``` + - If CRs exist, list them with details: + ``` + WARNING: Found custom resources that may prevent clean uninstallation: + - namespace-1/<cr-name> (kind: <kind>) + - namespace-2/<cr-name> (kind: <kind>) + + These resources should be deleted before uninstalling the operator. + Do you want to delete these custom resources? (yes/no) + ``` + - If user confirms, delete each CR: + ```bash + oc delete <crd-name> <cr-name> -n <cr-namespace> + ``` + - This prevents namespace from getting stuck in Terminating state + - Reference: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-reinstalling-operators-after-failed-uninstallation_olm-troubleshooting-operator-issues + +9. **Remove Custom Resource Definitions** (if `--remove-crds` flag is set): + - **WARNING**: Display critical warning to user: + ``` + WARNING: Removing CRDs will delete ALL custom resources of these types across the entire cluster! + This action is irreversible and will affect all namespaces. + + Are you absolutely sure you want to remove CRDs? 
(yes/no) + ``` + - If user confirms, proceed with CRD removal + - Get list of CRDs owned by the operator: + ```bash + oc get csv {csv-name} -n {namespace} -o jsonpath='{.spec.customresourcedefinitions.owned[*].name}' + ``` + - For each CRD, check if custom resources exist: + ```bash + oc get {crd-name} --all-namespaces --ignore-not-found + ``` + - Display warning if custom resources exist + - Delete CRDs: + ```bash + oc delete crd {crd-name} + ``` + +10. **Remove Namespace** (if `--remove-namespace` flag is set): + - **WARNING**: Display warning: + ``` + WARNING: Removing namespace {namespace} will delete all resources in this namespace! + + Are you sure you want to remove namespace {namespace}? (yes/no) + ``` + - If user confirms: + ```bash + oc delete namespace {namespace} + ``` + - Monitor namespace deletion with timeout: + ```bash + oc wait --for=delete namespace/{namespace} --timeout=120s + ``` + - If namespace gets stuck in "Terminating" state after 120 seconds: + - Check for resources preventing deletion: + ```bash + oc api-resources --verbs=list --namespaced -o name | \ + xargs -n 1 oc get --show-kind --ignore-not-found -n {namespace} + ``` + - Check for finalizers on the namespace: + ```bash + oc get namespace {namespace} -o jsonpath='{.metadata.finalizers}' + ``` + - Display helpful error message: + ``` + ERROR: Namespace {namespace} is stuck in Terminating state. + + Possible causes: + - Resources with finalizers preventing deletion + - API services that are unavailable + - Custom resources that cannot be deleted + + To diagnose and fix, run: /olm:diagnose {operator-name} {namespace} + + Manual troubleshooting: + 1. Check remaining resources: + oc api-resources --verbs=list --namespaced -o name | \ + xargs -n 1 oc get --show-kind --ignore-not-found -n {namespace} + + 2. Check namespace finalizers: + oc get namespace {namespace} -o yaml | grep -A5 finalizers + + WARNING: Do NOT force-delete the namespace as it can lead to unstable cluster behavior. 
+ See: https://access.redhat.com/solutions/4165791 + ``` + - Exit with error code + - Note: OperatorGroup will be automatically deleted with the namespace + +11. **Post-Uninstall Verification**: + - Verify all resources are cleaned up: + ```bash + oc get subscription,csv,installplan -n {namespace} --ignore-not-found + ``` + - Check if any CRDs remain (if they were supposed to be deleted): + ```bash + oc get crd | grep + ``` + - If uninstalling without `--remove-namespace`, check namespace is clean: + ```bash + oc get all -n {namespace} + ``` + - Display any remaining resources with suggestions for cleanup + +12. **Display Uninstallation Summary**: + - Show what was successfully removed: + ``` + ✓ Uninstallation Summary: + ✓ Subscription '{operator-name}' deleted + ✓ CSV '{csv-name}' deleted + ✓ Operator deployments removed + [✓ X custom resources deleted] + [✓ Y CRDs removed] + [✓ Namespace '{namespace}' deleted] + ``` + - If CRDs or namespace were NOT removed, provide instructions: + ``` + Note: The following resources were NOT removed: + - Custom Resource Definitions (use --remove-crds to remove) + - Namespace {namespace} (use --remove-namespace to remove) + + To completely remove all operator resources, run: + /olm:uninstall {operator-name} {namespace} --remove-crds --remove-namespace + ``` + - **Important warning about reinstallation**: + ``` + IMPORTANT: Before reinstalling this operator, verify all resources are cleaned: + + oc get subscription,csv,installplan -n {namespace} + oc get crd | grep + + Failure to completely uninstall may cause reinstallation issues. 
+ See: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-reinstalling-operators-after-failed-uninstallation_olm-troubleshooting-operator-issues + ``` + +## Return Value +- **Success**: Operator uninstalled successfully with summary of removed resources +- **Partial Success**: Some resources removed with warnings about remaining resources +- **Error**: Uninstallation failed with specific error message +- **Format**: Structured output showing: + - Subscription deletion status + - CSV deletion status + - Deployment removal status + - CRD removal status (if applicable) + - Namespace deletion status (if applicable) + +## Examples + +1. **Uninstall cert-manager-operator (basic)**: + ``` + /olm:uninstall openshift-cert-manager-operator + ``` + +2. **Uninstall with custom namespace**: + ``` + /olm:uninstall openshift-cert-manager-operator my-cert-manager + ``` + +3. **Complete cleanup including namespace**: + ``` + /olm:uninstall openshift-cert-manager-operator cert-manager-operator --remove-crds --remove-namespace + ``` + This performs a complete cleanup of all operator-related resources. + +4. **Force uninstall without prompts**: + ``` + /olm:uninstall openshift-cert-manager-operator cert-manager-operator --force + ``` + Skips all confirmation prompts (use with caution!). 
+ +## Arguments +- **$1** (operator-name): The name of the operator to uninstall (required) + - Example: "openshift-cert-manager-operator" + - Must match the Subscription name +- **$2** (namespace): The namespace where operator is installed (optional) + - Default: `{operator-name}` (operator name without "openshift-" prefix) + - Example: "cert-manager-operator" +- **$3+** (flags): Optional flags (can combine multiple): + - `--remove-crds`: Remove Custom Resource Definitions (WARNING: affects entire cluster) + - `--remove-namespace`: Remove the operator's namespace and all its resources + - `--force`: Skip all confirmation prompts (use with caution) + +## Safety Features + +1. **Multiple Confirmations**: Separate confirmations for CRD and namespace removal +2. **Detailed Warnings**: Clear warnings about the scope of deletions +3. **Verification Steps**: Checks that resources exist before attempting deletion +4. **Summary Report**: Detailed summary of what was and wasn't removed +5. **Graceful Failures**: Continues with remaining steps if individual deletions fail + +## Troubleshooting + +- **Subscription not found**: Verify the operator name and namespace: + ```bash + oc get subscriptions --all-namespaces | grep {operator-name} + ``` +- **CSV won't delete**: Check for finalizers: + ```bash + oc get csv {csv-name} -n {namespace} -o yaml | grep finalizers + ``` + If finalizers are present, they may be waiting for resources to be cleaned up. Check operator logs and events. + +- **Namespace stuck in Terminating**: This is a common issue after operator uninstallation. + ```bash + # Find remaining resources + oc api-resources --verbs=list --namespaced -o name | \ + xargs -n 1 oc get --show-kind --ignore-not-found -n {namespace} + + # Check namespace finalizers + oc get namespace {namespace} -o yaml | grep -A5 finalizers + ``` + **IMPORTANT**: Do not force-delete the namespace. This can cause cluster instability. 
+ Instead, use `/olm:diagnose {operator-name} {namespace}` to diagnose and fix the issue. + +- **CRDs won't delete**: Check for remaining custom resources: + ```bash + oc get {crd-name} --all-namespaces + ``` + A CRD deletion stays pending while CR instances that still carry finalizers exist. Delete all CRs first. + +- **Custom resources won't delete**: Some CRs may have finalizers preventing deletion: + ```bash + oc get {crd-name} {resource-name} -n {resource-namespace} -o yaml | grep finalizers + ``` + The operator controller (if still running) should remove finalizers. If operator is already deleted, you may need to manually patch the CR to remove finalizers (use with extreme caution). + +- **Permission denied**: Ensure you have cluster-admin privileges for CRD deletion: + ```bash + oc auth can-i delete crd + ``` + +- **Reinstallation fails after uninstall**: This usually means cleanup was incomplete. + Run these checks before reinstalling: + ```bash + # Check for remaining subscriptions/CSVs + oc get subscription,csv -n {namespace} + + # Check for remaining CRDs + oc get crd | grep {operator-base-name} + + # Check if namespace is clean or stuck + oc get namespace {namespace} + ``` + See: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-reinstalling-operators-after-failed-uninstallation_olm-troubleshooting-operator-issues + +## Related Commands + +- `/olm:install` - Install a day-2 operator +- `/olm:list` - List installed operators +- `/olm:status` - Check operator status before uninstalling +- `/olm:diagnose` - Diagnose and fix uninstallation issues +- `/olm:upgrade` - Upgrade an operator + +## Additional Resources + +- [Red Hat OpenShift: Deleting Operators from a cluster](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-deleting-operators-from-a-cluster) +- [Red Hat OpenShift: Reinstalling Operators after failed 
uninstallation](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-reinstalling-operators-after-failed-uninstallation_olm-troubleshooting-operator-issues) +- [Operator Lifecycle Manager Documentation](https://olm.operatorframework.io/) + diff --git a/plugins/olm/commands/upgrade.md b/plugins/olm/commands/upgrade.md new file mode 100644 index 000000000..75434f615 --- /dev/null +++ b/plugins/olm/commands/upgrade.md @@ -0,0 +1,349 @@ +--- +description: Update an operator to the latest version or switch channels +argument-hint: <operator-name> [namespace] [--channel=<channel-name>] [--approve] +--- + +## Name +olm:upgrade + +## Synopsis +``` +/olm:upgrade <operator-name> [namespace] [--channel=<channel-name>] [--approve] +``` + +## Description +The `olm:upgrade` command updates an installed operator to the latest version in its current channel or switches to a different channel. It can also approve pending InstallPlans for operators with manual approval mode. + +This command helps you: +- Update operators to the latest version in their channel +- Switch operators to different channels (e.g., stable to tech-preview) +- Approve pending upgrade InstallPlans for manual approval mode +- Monitor upgrade progress +- Rollback on failure (if possible via OLM) + +## Implementation + +The command performs the following steps: + +1. **Parse Arguments**: + - `$1`: Operator name (required) - Name of the operator to upgrade + - `$2`: Namespace (optional) - Namespace where operator is installed + - If not provided, searches for the operator across all namespaces + - `$3+`: Flags (optional): + - `--channel=<channel-name>`: Switch to a different channel + - `--approve`: Automatically approve pending InstallPlan (for manual approval mode) + +2. **Prerequisites Check**: + - Verify `oc` CLI is installed: `which oc` + - Verify cluster access: `oc whoami` + - Check if user has sufficient privileges + +3. 
**Locate Operator**: + - If namespace provided, verify operator exists: + ```bash + oc get subscription {operator-name} -n {namespace} --ignore-not-found + ``` + - If no namespace provided, search across all namespaces: + ```bash + oc get subscription --all-namespaces -o json | jq -r '.items[] | select(.spec.name=="{operator-name}") | .metadata.namespace' + ``` + - If not found, display error with suggestions + - If multiple instances found, prompt user to specify namespace + +4. **Get Current State**: + - Get current Subscription: + ```bash + oc get subscription {operator-name} -n {namespace} -o json + ``` + - Extract: + - Current channel: `.spec.channel` + - Install plan approval: `.spec.installPlanApproval` + - Installed CSV: `.status.installedCSV` + - Current CSV: `.status.currentCSV` + - Get current CSV version: + ```bash + oc get csv {installed-csv} -n {namespace} -o jsonpath='{.spec.version}' + ``` + +5. **Check for Available Updates**: + - Get PackageManifest: + ```bash + oc get packagemanifest {operator-name} -n openshift-marketplace -o json + ``` + - Extract available channels and their latest versions + - If `--channel` flag is specified, verify channel exists + - If no channel flag, check for updates in current channel + - Compare current version with latest available version + - Reference: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-updating-operators + +6. **Display Upgrade Plan**: + ``` + Operator Upgrade Plan: + + Operator: {display-name} + Namespace: {namespace} + Current Version: {current-version} + Current Channel: {current-channel} + + [If switching channels:] + Target Channel: {new-channel} + Target Version: {new-version} + + [If upgrading in same channel:] + Latest Version: {latest-version} (in channel: {current-channel}) + + Approval Mode: {Automatic|Manual} + ``` + +7. 
**Check for Pending InstallPlans** (for manual approval mode): + - Get pending InstallPlans: + ```bash + oc get installplan -n {namespace} -o json | jq '.items[] | select(.spec.approved==false)' + ``` + - If pending InstallPlan exists and `--approve` flag is set: + - Display InstallPlan details + - Approve the InstallPlan (skip to step 9) + - If pending InstallPlan exists and no `--approve` flag: + ``` + ⏸️ Pending InstallPlan found (requires manual approval) + + InstallPlan: {installplan-name} + Target Version: {target-version} + + To approve: /olm:upgrade {operator-name} {namespace} --approve + Or use: /olm:approve {operator-name} {namespace} + ``` + - Exit, waiting for user to approve + +8. **Perform Channel Switch** (if `--channel` flag provided): + - Confirm with user (unless `--force` flag): + ``` + WARNING: Switching channels may upgrade or downgrade the operator. + + Current: {current-channel} ({current-version}) + Target: {new-channel} ({target-version}) + + Continue? (yes/no) + ``` + - Update Subscription to new channel: + ```bash + oc patch subscription {operator-name} -n {namespace} \ + --type merge --patch '{"spec":{"channel":"{new-channel}"}}' + ``` + - Display confirmation: + ``` + ✓ Subscription updated to channel: {new-channel} + ``` + +9. **Approve Pending InstallPlan** (if `--approve` flag or automatic approval): + - If approval mode is Manual and `--approve` flag is set: + ```bash + oc patch installplan {installplan-name} -n {namespace} \ + --type merge --patch '{"spec":{"approved":true}}' + ``` + - Display approval confirmation: + ``` + ✓ InstallPlan approved: {installplan-name} + ``` + +10. **Monitor Upgrade Progress**: + - Wait for new InstallPlan to be created (if switching channels): + ```bash + oc get installplan -n {namespace} -w --request-timeout=60s + ``` + - Wait for new CSV to reach "Succeeded" phase: + ```bash + oc get csv -n {namespace} -w --request-timeout=300s + ``` + - Display progress updates: + ``` + 🔄 Upgrade in progress... 
+ ⏳ Waiting for InstallPlan to complete... + ⏳ New CSV installing: {new-csv-name} + ⏳ Old CSV replacing: {old-csv-name} + ``` + - Poll every 10 seconds to check status + - Timeout: 10 minutes for upgrade to complete + +11. **Verify Upgrade Success**: + - Check new CSV status: + ```bash + oc get csv -n {namespace} -o json + ``` + - Verify new CSV phase is "Succeeded" + - Get new version: + ```bash + oc get csv {new-csv-name} -n {namespace} -o jsonpath='{.spec.version}' + ``` + - Check deployments are healthy: + ```bash + oc get deployments -n {namespace} + ``` + - Check pods are running: + ```bash + oc get pods -n {namespace} + ``` + +12. **Display Upgrade Summary**: + ``` + ✓ Operator Upgrade Complete! + + Operator: {display-name} + Namespace: {namespace} + Previous Version: {old-version} + Current Version: {new-version} + Channel: {channel} + + Deployment Status: + - {deployment-1}: 1/1 replicas ready + - {deployment-2}: 1/1 replicas ready + + To check status: /olm:status {operator-name} {namespace} + ``` + +13. **Handle Upgrade Failures**: + - If upgrade fails or times out: + ``` + ❌ Operator upgrade failed + + Current State: + - CSV: {csv-name} (Phase: {phase}) + - Message: {error-message} + + Troubleshooting steps: + 1. Check CSV status: oc describe csv {csv-name} -n {namespace} + 2. Check events: oc get events -n {namespace} --sort-by='.lastTimestamp' + 3. Check InstallPlan: oc get installplan -n {namespace} + 4. 
Run diagnostics: /olm:diagnose {operator-name} {namespace} + + To rollback (if OLM supports): + oc patch subscription {operator-name} -n {namespace} \ + --type merge --patch '{"spec":{"channel":"{old-channel}"}}' + ``` + +## Return Value +- **Success**: Operator upgraded successfully with new version details +- **Pending Approval**: Upgrade waiting for manual approval with instructions +- **No Update Available**: Operator is already at the latest version +- **Error**: Upgrade failed with specific error message and troubleshooting guidance +- **Format**: Structured output showing: + - Previous and current versions + - Channel information + - Deployment and pod status + - Next steps or related commands + +## Examples + +1. **Check for and install updates in current channel**: + ``` + /olm:upgrade openshift-cert-manager-operator + ``` + +2. **Upgrade with specific namespace**: + ``` + /olm:upgrade external-secrets-operator eso-operator + ``` + +3. **Switch to a different channel**: + ``` + /olm:upgrade openshift-cert-manager-operator cert-manager-operator --channel=tech-preview-v1.14 + ``` + This switches from stable-v1 to tech-preview-v1.14 channel. + +4. **Approve pending upgrade (manual approval mode)**: + ``` + /olm:upgrade openshift-cert-manager-operator --approve + ``` + +5. 
**Switch channel and approve in one command**: + ``` + /olm:upgrade prometheus prometheus-operator --channel=beta --approve + ``` + +## Arguments +- **$1** (operator-name): Name of the operator to upgrade (required) + - Example: "openshift-cert-manager-operator" + - Must match the operator's Subscription name +- **$2** (namespace): Namespace where operator is installed (optional) + - If not provided, searches all namespaces + - Example: "cert-manager-operator" +- **$3+** (flags): Optional flags + - `--channel=`: Switch to specified channel + - Example: `--channel=stable-v1`, `--channel=tech-preview` + - Triggers upgrade/downgrade to the version in that channel + - `--approve`: Automatically approve pending InstallPlan + - Only needed for operators with Manual approval mode + - Equivalent to `/olm:approve` command + +## Notes + +- **Automatic Updates**: Operators with `installPlanApproval: Automatic` will upgrade automatically when new versions are available in their channel +- **Manual Approval**: Operators with `installPlanApproval: Manual` require explicit approval via `--approve` flag or `/olm:approve` command +- **Channel Switching**: Changing channels may result in upgrade or downgrade depending on the versions in each channel +- **Rollback**: OLM has limited rollback support. 
Switching back to the previous channel may work, but data migration issues may occur +- **Upgrade Timing**: Upgrades happen according to the operator's upgrade strategy (some may cause downtime) + +## Troubleshooting + +- **No updates available**: + ```bash + # Check current version + oc get csv -n {namespace} + + # Check available versions + oc get packagemanifest {operator-name} -n openshift-marketplace -o json + ``` + +- **Upgrade stuck or pending**: + ```bash + # Check InstallPlan status + oc get installplan -n {namespace} + + # Check for events + oc get events -n {namespace} --sort-by='.lastTimestamp' | tail -20 + ``` + +- **Manual approval required**: + ```bash + # List pending InstallPlans + oc get installplan -n {namespace} -o json | jq '.items[] | select(.spec.approved==false)' + + # Approve specific InstallPlan + /olm:approve {operator-name} {namespace} + ``` + +- **Upgrade failed**: + ```bash + # Check CSV status + oc describe csv -n {namespace} + + # Check operator logs + oc logs -n {namespace} deployment/{operator-deployment} + + # Run diagnostics + /olm:diagnose {operator-name} {namespace} + ``` + +- **Rollback needed**: + - OLM doesn't have built-in rollback + - Can try switching back to previous channel, but may have issues: + ```bash + oc patch subscription {operator-name} -n {namespace} \ + --type merge --patch '{"spec":{"channel":"{old-channel}"}}' + ``` + - Consider backup/restore of custom resources before upgrading + +## Related Commands + +- `/olm:status ` - Check current version and available updates +- `/olm:approve ` - Approve pending InstallPlans +- `/olm:install ` - Install an operator +- `/olm:diagnose ` - Diagnose upgrade issues + +## Additional Resources + +- [Red Hat OpenShift: Updating Installed Operators](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-updating-operators) +- [Red Hat OpenShift: Approving Operator 
Upgrades](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-approving-operator-upgrades_olm-updating-operators) +- [Operator Lifecycle Manager Documentation](https://olm.operatorframework.io/) + + From 8de00e0f41511c7d23c49743f1e4f0c3a4ea8bac Mon Sep 17 00:00:00 2001 From: Stephen Benjamin Date: Wed, 6 May 2026 06:35:48 -0400 Subject: [PATCH 6/7] chore: process /drop directive from pruning PR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dropped (undoing previous save): - plugins/olm/commands/approve.md — dropped by @stbenjam - plugins/olm/commands/catalog.md — dropped by @stbenjam - plugins/olm/commands/install.md — dropped by @stbenjam - plugins/olm/commands/opm.md — dropped by @stbenjam - plugins/olm/commands/uninstall.md — dropped by @stbenjam - plugins/olm/commands/upgrade.md — dropped by @stbenjam Removed plugins/olm/ from .pruneprotect. Bumped olm plugin version 0.1.1 → 0.1.2. 
Co-Authored-By: Claude Opus 4.6 --- .claude-plugin/marketplace.json | 2 +- .pruneprotect | 2 - PLUGINS.md | 6 - docs/data.json | 38 +-- plugins/olm/.claude-plugin/plugin.json | 2 +- plugins/olm/commands/approve.md | 305 ----------------- plugins/olm/commands/catalog.md | 433 ------------------------- plugins/olm/commands/install.md | 272 ---------------- plugins/olm/commands/opm.md | 359 -------------------- plugins/olm/commands/uninstall.md | 392 ---------------------- plugins/olm/commands/upgrade.md | 349 -------------------- 11 files changed, 3 insertions(+), 2157 deletions(-) delete mode 100644 plugins/olm/commands/approve.md delete mode 100644 plugins/olm/commands/catalog.md delete mode 100644 plugins/olm/commands/install.md delete mode 100644 plugins/olm/commands/opm.md delete mode 100644 plugins/olm/commands/uninstall.md delete mode 100644 plugins/olm/commands/upgrade.md diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 2e0ea0ca7..d85e949e8 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -56,7 +56,7 @@ "name": "olm", "source": "./plugins/olm", "description": "OLM (Operator Lifecycle Manager) plugin for operator management and debugging", - "version": "0.1.1" + "version": "0.1.2" }, { "name": "olm-team", diff --git a/.pruneprotect b/.pruneprotect index f63330f29..201631758 100644 --- a/.pruneprotect +++ b/.pruneprotect @@ -17,5 +17,3 @@ plugins/ci/ plugins/hcp/ plugins/sosreport/ -# Saved by @stbenjam on 2026-05-05 -plugins/olm/ diff --git a/PLUGINS.md b/PLUGINS.md index 6e7dafd34..b5464b582 100644 --- a/PLUGINS.md +++ b/PLUGINS.md @@ -189,17 +189,11 @@ See [plugins/must-gather/README.md](plugins/must-gather/README.md) for detailed OLM (Operator Lifecycle Manager) plugin for operator management and debugging **Commands:** -- **`/olm:approve` ` [namespace] [--all]`** - Approve pending InstallPlans for operator installations and upgrades -- **`/olm:catalog` ` [arguments]`** - Manage 
catalog sources for discovering and installing operators - **`/olm:debug` ` [olm-version]`** - Debug OLM issues using must-gather logs and source code analysis - **`/olm:diagnose` `[operator-name] [namespace] [--fix] [--cluster]`** - Diagnose and optionally fix common OLM and operator issues -- **`/olm:install` ` [namespace] [channel] [source] [--approval=Automatic|Manual]`** - Install a day-2 operator using Operator Lifecycle Manager - **`/olm:list` `[namespace] [--all-namespaces]`** - List installed operators in the cluster -- **`/olm:opm` ` [arguments...]`** - Execute opm (Operator Package Manager) commands for building and managing operator catalogs - **`/olm:search` `[query] [--catalog ]`** - Search for available operators in catalog sources - **`/olm:status` ` [namespace]`** - Get detailed status and health information for an operator -- **`/olm:uninstall` ` [namespace] [--remove-crds] [--remove-namespace]`** - Uninstall a day-2 operator and optionally remove its resources -- **`/olm:upgrade` ` [namespace] [--channel=] [--approve]`** - Update an operator to the latest version or switch channels See [plugins/olm/README.md](plugins/olm/README.md) for detailed documentation. 
diff --git a/docs/data.json b/docs/data.json index 028db3915..d93689f0d 100644 --- a/docs/data.json +++ b/docs/data.json @@ -851,18 +851,6 @@ }, { "commands": [ - { - "argument_hint": " [namespace] [--all]", - "description": "Approve pending InstallPlans for operator installations and upgrades", - "name": "approve", - "synopsis": "/olm:approve [namespace] [--all]" - }, - { - "argument_hint": " [arguments]", - "description": "Manage catalog sources for discovering and installing operators", - "name": "catalog", - "synopsis": "/olm:catalog list" - }, { "argument_hint": " [olm-version]", "description": "Debug OLM issues using must-gather logs and source code analysis", @@ -875,24 +863,12 @@ "name": "diagnose", "synopsis": "/olm:diagnose [operator-name] [namespace] [--fix] [--cluster]" }, - { - "argument_hint": " [namespace] [channel] [source] [--approval=Automatic|Manual]", - "description": "Install a day-2 operator using Operator Lifecycle Manager", - "name": "install", - "synopsis": "/olm:install [namespace] [channel] [source] [--approval=Automatic|Manual]" - }, { "argument_hint": "[namespace] [--all-namespaces]", "description": "List installed operators in the cluster", "name": "list", "synopsis": "/olm:list [namespace] [--all-namespaces]" }, - { - "argument_hint": " [arguments...]", - "description": "Execute opm (Operator Package Manager) commands for building and managing operator catalogs", - "name": "opm", - "synopsis": "/olm:opm build-index-image [--cacheless] [--arch=] [--base-image=] [--builder-image=]" - }, { "argument_hint": "[query] [--catalog ]", "description": "Search for available operators in catalog sources", @@ -904,18 +880,6 @@ "description": "Get detailed status and health information for an operator", "name": "status", "synopsis": "/olm:status [namespace]" - }, - { - "argument_hint": " [namespace] [--remove-crds] [--remove-namespace]", - "description": "Uninstall a day-2 operator and optionally remove its resources", - "name": "uninstall", - 
"synopsis": "/olm:uninstall [namespace] [--remove-crds] [--remove-namespace]" - }, - { - "argument_hint": " [namespace] [--channel=] [--approve]", - "description": "Update an operator to the latest version or switch channels", - "name": "upgrade", - "synopsis": "/olm:upgrade [namespace] [--channel=] [--approve]" } ], "description": "OLM (Operator Lifecycle Manager) plugin for operator management and debugging", @@ -923,7 +887,7 @@ "hooks": [], "name": "olm", "skills": [], - "version": "0.1.1" + "version": "0.1.2" }, { "commands": [ diff --git a/plugins/olm/.claude-plugin/plugin.json b/plugins/olm/.claude-plugin/plugin.json index b6958e1ed..fb2925423 100644 --- a/plugins/olm/.claude-plugin/plugin.json +++ b/plugins/olm/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "olm", "description": "OLM (Operator Lifecycle Manager) plugin for operator management and debugging", - "version": "0.1.1", + "version": "0.1.2", "author": { "name": "github.com/openshift-eng" } diff --git a/plugins/olm/commands/approve.md b/plugins/olm/commands/approve.md deleted file mode 100644 index 3aa5437b0..000000000 --- a/plugins/olm/commands/approve.md +++ /dev/null @@ -1,305 +0,0 @@ ---- -description: Approve pending InstallPlans for operator installations and upgrades -argument-hint: [namespace] [--all] ---- - -## Name -olm:approve - -## Synopsis -``` -/olm:approve [namespace] [--all] -``` - -## Description -The `olm:approve` command approves pending InstallPlans for operators with manual approval mode. This is required for operators that have `installPlanApproval: Manual` in their Subscription to proceed with installation or upgrades. - -This command helps you: -- Approve operator installations that are waiting for manual approval -- Approve operator upgrades -- Review what will be installed/upgraded before approval -- Batch approve multiple pending InstallPlans - -## Implementation - -The command performs the following steps: - -1. 
**Parse Arguments**: - - `$1`: Operator name (required) - Name of the operator - - `$2`: Namespace (optional) - Namespace where operator is installed - - If not provided, searches for the operator across all namespaces - - `$3`: Flag (optional): - - `--all`: Approve all pending InstallPlans in the namespace - -2. **Prerequisites Check**: - - Verify `oc` CLI is installed: `which oc` - - Verify cluster access: `oc whoami` - - Check if user has sufficient privileges - -3. **Locate Operator**: - - If namespace provided, verify operator exists: - ```bash - oc get subscription {operator-name} -n {namespace} --ignore-not-found - ``` - - If no namespace provided, search across all namespaces: - ```bash - oc get subscription --all-namespaces -o json | jq -r '.items[] | select(.spec.name=="{operator-name}") | .metadata.namespace' - ``` - - If not found, display error with suggestions - -4. **Check Subscription Approval Mode**: - - Get Subscription approval mode: - ```bash - oc get subscription {operator-name} -n {namespace} -o jsonpath='{.spec.installPlanApproval}' - ``` - - If mode is "Automatic", display informational message: - ``` - ℹ️ Operator '{operator-name}' has automatic approval enabled. - InstallPlans are approved automatically and don't require manual intervention. - - Current Subscription approval mode: Automatic - - To switch to manual approval mode: - oc patch subscription {operator-name} -n {namespace} \ - --type merge --patch '{"spec":{"installPlanApproval":"Manual"}}' - ``` - - Exit if automatic (no approval needed) - -5. 
**Find Pending InstallPlans**: - - Get all InstallPlans for the operator: - ```bash - oc get installplan -n {namespace} -o json - ``` - - Filter for unapproved plans related to this operator: - ```bash - oc get installplan -n {namespace} -o json | \ - jq '.items[] | select(.spec.approved==false and .spec.clusterServiceVersionNames[] | contains("{operator-name}"))' - ``` - - If no pending InstallPlans found: - ``` - ✓ No pending InstallPlans found for operator '{operator-name}' - - The operator is up to date or already approved. - - To check operator status: /olm:status {operator-name} {namespace} - ``` - - Exit with success - -6. **Display InstallPlan Details**: - For each pending InstallPlan, display: - ``` - ⏸️ Pending InstallPlan Found - - InstallPlan: {installplan-name} - Namespace: {namespace} - Phase: {phase} - Approved: false - - ClusterServiceVersions to be installed/upgraded: - - {csv-name-1} ({version-1}) - - {csv-name-2} ({version-2}) - - Resources to be created/updated: - - CustomResourceDefinitions: {crd-count} - - ServiceAccounts: {sa-count} - - ClusterRoles: {role-count} - - Deployments: {deployment-count} - - [If upgrade:] - Current Version: {current-version} - Target Version: {target-version} - ``` - -7. **Request User Confirmation** (unless `--all` or `--force` flag): - - Display confirmation prompt: - ``` - Do you want to approve this InstallPlan? (yes/no) - ``` - - If user says no, skip this InstallPlan - - If user says yes, proceed to approval - -8. 
**Approve InstallPlan**: - - Patch the InstallPlan to approve it: - ```bash - oc patch installplan {installplan-name} -n {namespace} \ - --type merge --patch '{"spec":{"approved":true}}' - ``` - - Verify approval: - ```bash - oc get installplan {installplan-name} -n {namespace} -o jsonpath='{.spec.approved}' - ``` - - Display confirmation: - ``` - ✓ InstallPlan approved: {installplan-name} - ``` - - Reference: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-approving-operator-upgrades_olm-updating-operators - -9. **Monitor InstallPlan Execution** (optional): - - Watch InstallPlan phase change to "Complete": - ```bash - oc get installplan {installplan-name} -n {namespace} -w --timeout=120s - ``` - - Display progress: - ``` - 🔄 InstallPlan executing... - ⏳ Installing resources... - ``` - -10. **Verify Installation/Upgrade**: - - Wait for CSV to reach "Succeeded" phase: - ```bash - oc get csv -n {namespace} -o json | \ - jq -r '.items[] | select(.status.phase=="Succeeded") | .metadata.name' - ``` - - Display result: - ``` - ✓ Operator installation/upgrade complete - - CSV: {csv-name} - Version: {version} - Phase: Succeeded - - To check operator status: /olm:status {operator-name} {namespace} - ``` - -11. **Handle Multiple InstallPlans** (if `--all` flag): - - Process all pending InstallPlans for the operator - - Display summary: - ``` - ✓ Approved {count} InstallPlan(s) - - Approved: - - {installplan-1} - - {installplan-2} - - Monitoring installation progress... - ``` - -12. **Display Approval Summary**: - ``` - ✓ Approval Complete! - - Operator: {operator-name} - Namespace: {namespace} - Approved InstallPlans: {count} - - InstallPlan Status: - - {installplan-1}: Complete - - {installplan-2}: Installing... 
- - Monitor progress: watch oc get csv,installplan -n {namespace} - ``` - -## Return Value -- **Success**: InstallPlan(s) approved successfully -- **No Pending Plans**: No InstallPlans require approval -- **Automatic Mode**: Operator has automatic approval (no action needed) -- **Error**: Approval failed with specific error message -- **Format**: Structured output showing: - - Approved InstallPlan names - - Installation/upgrade status - - Next steps or related commands - -## Examples - -1. **Approve pending InstallPlan for an operator**: - ``` - /olm:approve openshift-cert-manager-operator - ``` - -2. **Approve with specific namespace**: - ``` - /olm:approve external-secrets-operator eso-operator - ``` - -3. **Approve all pending InstallPlans**: - ``` - /olm:approve openshift-cert-manager-operator cert-manager-operator --all - ``` - This approves all pending InstallPlans for the operator in the namespace. - -4. **Check and approve after upgrade command**: - ``` - /olm:upgrade openshift-cert-manager-operator --channel=tech-preview - # Wait for InstallPlan to be created - /olm:approve openshift-cert-manager-operator - ``` - -## Arguments -- **$1** (operator-name): Name of the operator (required) - - Example: "openshift-cert-manager-operator" - - Must match the operator's Subscription name -- **$2** (namespace): Namespace where operator is installed (optional) - - If not provided, searches all namespaces - - Example: "cert-manager-operator" -- **$3** (flag): Optional flag - - `--all`: Approve all pending InstallPlans for this operator - - Useful when multiple upgrades are pending - - Skips individual confirmation prompts - -## Notes - -- **Manual Approval Mode**: This command only works for operators with `installPlanApproval: Manual` in their Subscription -- **Automatic Operators**: Operators with automatic approval don't need this command -- **Review Before Approval**: Always review what will be installed/upgraded before approving -- **Multiple InstallPlans**: An 
operator may have multiple pending InstallPlans if updates accumulated while waiting for approval -- **InstallPlan Retention**: Approved InstallPlans remain in the namespace for audit purposes - -## Troubleshooting - -- **No pending InstallPlans**: - ```bash - # List all InstallPlans - oc get installplan -n {namespace} - - # Check if operator is in automatic mode - oc get subscription {operator-name} -n {namespace} -o jsonpath='{.spec.installPlanApproval}' - ``` - -- **InstallPlan not executing after approval**: - ```bash - # Check InstallPlan status - oc describe installplan {installplan-name} -n {namespace} - - # Check for errors - oc get events -n {namespace} --sort-by='.lastTimestamp' | grep InstallPlan - ``` - -- **CSV not reaching Succeeded phase**: - ```bash - # Check CSV status - oc describe csv -n {namespace} - - # Check operator deployment - oc get deployments -n {namespace} - - # Check operator logs - oc logs -n {namespace} deployment/{operator-deployment} - ``` - -- **Permission denied**: - ```bash - # Check if you can patch InstallPlans - oc auth can-i patch installplan -n {namespace} - ``` - -- **Multiple namespaces found**: - - Specify the namespace explicitly in the command: - ``` - /olm:approve {operator-name} {specific-namespace} - ``` - -## Related Commands - -- `/olm:status ` - Check if InstallPlans are pending approval -- `/olm:upgrade ` - Trigger upgrade and approve in one command -- `/olm:install ` - Install operator with approval mode -- `/olm:list` - List operators and their approval modes - -## Additional Resources - -- [Red Hat OpenShift: Approving Operator Upgrades](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-approving-operator-upgrades_olm-updating-operators) -- [Red Hat OpenShift: Updating Installed Operators](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-updating-operators) -- [Operator Lifecycle 
Manager Documentation](https://olm.operatorframework.io/) - - diff --git a/plugins/olm/commands/catalog.md b/plugins/olm/commands/catalog.md deleted file mode 100644 index cd43964ea..000000000 --- a/plugins/olm/commands/catalog.md +++ /dev/null @@ -1,433 +0,0 @@ ---- -description: Manage catalog sources for discovering and installing operators -argument-hint: [arguments] ---- - -## Name -olm:catalog - -## Synopsis -``` -/olm:catalog list -/olm:catalog add [--namespace=openshift-marketplace] -/olm:catalog remove [--namespace=openshift-marketplace] -/olm:catalog refresh [--namespace=openshift-marketplace] -/olm:catalog status [--namespace=openshift-marketplace] -``` - -## Description -The `olm:catalog` command manages catalog sources for operator discovery and installation. Catalog sources provide the list of operators available for installation in the cluster. - -This command helps you: -- List all available catalog sources and their health status -- Add custom or private catalog sources -- Remove catalog sources -- Refresh catalog sources to get latest operator updates - -## Implementation - -### Subcommand: list - -1. **Get All CatalogSources**: - ```bash - oc get catalogsource -n openshift-marketplace -o json - ``` - -2. **Parse CatalogSource Data**: - For each catalog, extract: - - Name: `.metadata.name` - - Display Name: `.spec.displayName` - - Publisher: `.spec.publisher` - - Source Type: `.spec.sourceType` (grpc, configmap, etc.) - - Image: `.spec.image` (for grpc type) - - Connection State: `.status.connectionState.lastObservedState` - - Last Updated: `.status.connectionState.lastUpdatedTime` - - Number of Operators: Count from PackageManifests with this catalog - -3. **Get Catalog Pod Status**: - ```bash - oc get pods -n openshift-marketplace -l olm.catalogSource={catalog-name} - ``` - -4. 
**Format Output**: - ``` - ═══════════════════════════════════════════════════════════ - CATALOG SOURCES - ═══════════════════════════════════════════════════════════ - - NAME STATUS OPERATORS LAST UPDATED SOURCE TYPE - redhat-operators READY 150 2h ago grpc - certified-operators READY 45 3h ago grpc - community-operators READY 200 1h ago grpc - redhat-marketplace READY 30 4h ago grpc - custom-catalog FAILED 0 - grpc - - ═══════════════════════════════════════════════════════════ - DETAILS - ═══════════════════════════════════════════════════════════ - - redhat-operators: - Display Name: Red Hat Operators - Publisher: Red Hat - Image: registry.redhat.io/redhat/redhat-operator-index:v4.20 - Pod: redhat-operators-abc123 (Running) - - custom-catalog (FAILED): - Display Name: Custom Catalog - Publisher: My Company - Image: registry.example.com/custom-catalog:latest - Pod: custom-catalog-xyz789 (CrashLoopBackOff) - Error: ImagePullBackOff - - To troubleshoot: - /olm:catalog status custom-catalog - ``` - -### Subcommand: add - -1. **Parse Arguments**: - - `name`: Catalog source name (required) - - `image`: Catalog image (required) - - `--namespace`: Target namespace (default: openshift-marketplace) - - `--display-name`: Display name (optional) - - `--publisher`: Publisher name (optional) - -2. **Validate Image**: - - Check if image format is valid - - Optionally test image accessibility (if possible) - -3. **Create CatalogSource Manifest**: - ```yaml - apiVersion: operators.coreos.com/v1alpha1 - kind: CatalogSource - metadata: - name: {name} - namespace: {namespace} - spec: - sourceType: grpc - image: {image} - displayName: {display-name} - publisher: {publisher} - updateStrategy: - registryPoll: - interval: 30m - ``` - -4. **Apply CatalogSource**: - ```bash - oc apply -f /tmp/catalogsource-{name}.yaml - ``` - -5. **Wait for CatalogSource to be Ready**: - ```bash - oc wait --for=condition=READY catalogsource/{name} -n {namespace} --timeout=300s - ``` - -6. 
**Verify Pod is Running**: - ```bash - oc get pods -n {namespace} -l olm.catalogSource={name} - ``` - -7. **Display Result**: - ``` - ✓ Catalog source added: {name} - - Name: {name} - Namespace: {namespace} - Image: {image} - Status: READY - Pod: {pod-name} (Running) - - To search operators: /olm:search --catalog {name} - ``` - -### Subcommand: remove - -1. **Parse Arguments**: - - `name`: Catalog source name (required) - - `--namespace`: Namespace (default: openshift-marketplace) - -2. **Check if CatalogSource Exists**: - ```bash - oc get catalogsource {name} -n {namespace} --ignore-not-found - ``` - -3. **Check for Operators Using This Catalog**: - ```bash - oc get subscription --all-namespaces -o json | \ - jq -r '.items[] | select(.spec.source=="{name}") | "\(.metadata.namespace)/\(.metadata.name)"' - ``` - -4. **Display Warning** (if operators found): - ``` - WARNING: The following operators are using this catalog: - - namespace-1/operator-1 - - namespace-2/operator-2 - - Removing this catalog will prevent these operators from receiving updates. - - Do you want to continue? (yes/no) - ``` - -5. **Delete CatalogSource**: - ```bash - oc delete catalogsource {name} -n {namespace} - ``` - -6. **Wait for Pod to be Deleted**: - ```bash - oc wait --for=delete pod -l olm.catalogSource={name} -n {namespace} --timeout=60s - ``` - -7. **Display Result**: - ``` - ✓ Catalog source removed: {name} - ``` - -### Subcommand: refresh - -1. **Parse Arguments**: - - `name`: Catalog source name (required) - - `--namespace`: Namespace (default: openshift-marketplace) - -2. **Get Current CatalogSource**: - ```bash - oc get catalogsource {name} -n {namespace} -o json - ``` - -3. **Trigger Refresh by Deleting Pod**: - ```bash - oc delete pod -n {namespace} -l olm.catalogSource={name} - ``` - - This forces OLM to recreate the pod and re-fetch catalog data - -4. 
**Wait for New Pod to be Ready**: - ```bash - oc wait --for=condition=Ready pod -l olm.catalogSource={name} -n {namespace} --timeout=300s - ``` - -5. **Verify Catalog is Updated**: - ```bash - oc get catalogsource {name} -n {namespace} -o json | \ - jq -r '.status.connectionState.lastUpdatedTime' - ``` - -6. **Display Result**: - ``` - ✓ Catalog source refreshed: {name} - - Last Updated: {timestamp} - Status: READY - Pod: {pod-name} (Running) - - New operators may now be available: /olm:search --catalog {name} - ``` - -### Subcommand: status - -1. **Parse Arguments**: - - `name`: Catalog source name (required) - - `--namespace`: Namespace (default: openshift-marketplace) - -2. **Get CatalogSource Details**: - ```bash - oc get catalogsource {name} -n {namespace} -o json - ``` - -3. **Get Pod Details**: - ```bash - oc get pods -n {namespace} -l olm.catalogSource={name} -o json - ``` - -4. **Get Recent Events**: - ```bash - oc get events -n {namespace} --field-selector involvedObject.name={name} --sort-by='.lastTimestamp' - ``` - -5. **Count Available Operators**: - ```bash - oc get packagemanifests -n openshift-marketplace -o json | \ - jq -r '.items[] | select(.status.catalogSource=="{name}") | .metadata.name' | wc -l - ``` - -6. **Verify Catalog Connectivity**: - - Check if catalog is serving content by verifying PackageManifest count > 0 - - If count is 0 but pod is Running, indicates connectivity or catalog index issues - - Review catalog pod logs for gRPC errors, image pull issues, or index corruption: - ```bash - oc logs -n {namespace} {catalog-pod-name} - ``` - -7. 
**Format Comprehensive Status Report**: - ``` - ═══════════════════════════════════════════════════════════ - CATALOG SOURCE STATUS: {name} - ═══════════════════════════════════════════════════════════ - - General Information: - Name: {name} - Namespace: {namespace} - Display Name: {display-name} - Publisher: {publisher} - Source Type: {source-type} - Image: {image} - - Connection Status: - State: {state} (READY | CONNECTING | CONNECTION_FAILED) - Last Updated: {timestamp} - Last Successful: {timestamp} - - Pod Status: - Name: {pod-name} - Status: {status} (Running | CrashLoopBackOff | ImagePullBackOff) - Ready: {ready-containers}/{total-containers} - Restarts: {restart-count} - Age: {age} - - Catalog Content: - Operators Available: {count} - - [If issues detected:] - ⚠️ Issues Detected: - - Pod in CrashLoopBackOff - - Last update: 24h ago (stale) - - Connection state: CONNECTION_FAILED - - Recent Events: - {timestamp} Warning: Failed to pull image - {timestamp} Warning: Back-off restarting failed container - - Troubleshooting Steps: - 1. Check pod logs: oc logs -n {namespace} {pod-name} - 2. Check image accessibility - 3. Refresh catalog: /olm:catalog refresh {name} - 4. Verify network connectivity (for disconnected environments) - - Related Commands: - - Refresh: /olm:catalog refresh {name} - - List operators: /olm:search --catalog {name} - ``` - -## Return Value -- **list**: Table of all catalog sources with status -- **add**: Confirmation of added catalog with details -- **remove**: Confirmation of removed catalog -- **refresh**: Confirmation of refresh with updated timestamp -- **status**: Comprehensive status report for specific catalog - -## Examples - -1. **List all catalog sources**: - ``` - /olm:catalog list - ``` - -2. **Add custom catalog**: - ``` - /olm:catalog add my-catalog registry.example.com/my-catalog:v1.0 - ``` - -3. 
**Add catalog with metadata**: - ``` - /olm:catalog add my-catalog registry.example.com/catalog:latest \ - --display-name="My Custom Catalog" \ - --publisher="My Company" - ``` - -4. **Remove catalog**: - ``` - /olm:catalog remove my-catalog - ``` - -5. **Refresh catalog to get latest operators**: - ``` - /olm:catalog refresh redhat-operators - ``` - -6. **Check catalog health**: - ``` - /olm:catalog status custom-catalog - ``` - -7. **Add catalog for disconnected environment**: - ``` - /olm:catalog add disconnected-operators \ - mirror-registry.local:5000/olm/redhat-operators:v4.20 \ - --namespace=openshift-marketplace - ``` - -## Arguments - -### list -No arguments required. - -### add -- **name** (required): Name for the catalog source -- **image** (required): Container image containing the catalog -- **--namespace**: Target namespace (default: openshift-marketplace) -- **--display-name**: Human-readable display name -- **--publisher**: Publisher/organization name - -### remove -- **name** (required): Name of the catalog source to remove -- **--namespace**: Namespace (default: openshift-marketplace) - -### refresh -- **name** (required): Name of the catalog source to refresh -- **--namespace**: Namespace (default: openshift-marketplace) - -### status -- **name** (required): Name of the catalog source to check -- **--namespace**: Namespace (default: openshift-marketplace) - -## Troubleshooting - -- **Catalog pod failing**: - ```bash - # Check pod logs - oc logs -n openshift-marketplace {catalog-pod-name} - - # Check image pull issues - oc describe pod -n openshift-marketplace {catalog-pod-name} - ``` - -- **No operators showing up**: - ```bash - # Verify catalog is ready - /olm:catalog status {catalog-name} - - # Check PackageManifests - oc get packagemanifests -n openshift-marketplace - ``` - -- **Image pull errors (disconnected environment)**: - - Verify image registry is accessible - - Check pull secrets are configured - - Ensure image has been mirrored 
correctly - -- **Stale catalog data**: - ```bash - # Force refresh - /olm:catalog refresh {catalog-name} - ``` - -- **Connection failures**: - ```bash - # Check catalog source definition - oc get catalogsource {catalog-name} -n openshift-marketplace -o yaml - - # Run cluster diagnostics - /olm:diagnose --cluster - ``` - -## Related Commands - -- `/olm:search` - Search for operators in catalogs -- `/olm:install` - Install operators from catalogs -- `/olm:diagnose` - Diagnose catalog health issues - -## Additional Resources -- [Building Catalog Images with opm](https://olm.operatorframework.io/docs/tasks/creating-catalog-from-index/) -- [Operator Lifecycle Manager Documentation](https://olm.operatorframework.io/) - - diff --git a/plugins/olm/commands/install.md b/plugins/olm/commands/install.md deleted file mode 100644 index ccc0bcfc2..000000000 --- a/plugins/olm/commands/install.md +++ /dev/null @@ -1,272 +0,0 @@ ---- -description: Install a day-2 operator using Operator Lifecycle Manager -argument-hint: [namespace] [channel] [source] [--approval=Automatic|Manual] ---- - -## Name -olm:install - -## Synopsis -``` -/olm:install [namespace] [channel] [source] [--approval=Automatic|Manual] -``` - -## Description -The `olm:install` command installs a day-2 operator in an OpenShift cluster using Operator Lifecycle Manager (OLM). It automates the creation of the required namespace, OperatorGroup, and Subscription resources needed to install an operator. - -This command handles the complete operator installation workflow: -- Creates or verifies the target namespace exists -- Creates an OperatorGroup if needed -- Creates a Subscription to install the operator -- Verifies the installation by checking the operator's CSV (ClusterServiceVersion) status -- Provides detailed feedback on the installation progress - -The command is designed to work with operators from the OperatorHub catalog, including Red Hat certified operators, community operators, and custom catalog sources. 
- -## Implementation - -The command performs the following steps: - -1. **Parse Arguments**: - - `$1`: Operator name (required) - The name of the operator to install (e.g., "openshift-cert-manager-operator") - - `$2`: Namespace (optional) - Target namespace for the operator. If not provided, defaults to the operator name with any leading "openshift-" prefix removed (e.g., "openshift-cert-manager-operator" becomes "cert-manager-operator") - - `$3`: Channel (optional) - Subscription channel. If not provided, discovers the default channel from the operator's PackageManifest - - `$4`: Source (optional) - CatalogSource name. Defaults to "redhat-operators" for Red Hat operators - - `$5+`: Flags (optional): - - `--approval=Automatic|Manual`: InstallPlan approval mode (default: Automatic) - - Automatic: Operator upgrades are automatically installed - - Manual: Operator upgrades require manual approval via `/olm:approve` or `oc patch` - -2. **Prerequisites Check**: - - Verify `oc` CLI is installed: `which oc` - - Verify cluster access: `oc whoami` - - Check if user has cluster-admin or sufficient privileges - - If not installed or not authenticated, provide clear instructions - -3. **Discover Operator Metadata** (if channel or source not provided): - - Search for the operator in available catalogs: - ```bash - oc get packagemanifests -n openshift-marketplace | grep {operator-name} - ``` - - Get the PackageManifest details: - ```bash - oc get packagemanifest {operator-name} -n openshift-marketplace -o json - ``` - - Extract: - - Default channel: `.status.defaultChannel` - - CatalogSource: `.status.catalogSource` - - CatalogSourceNamespace: `.status.catalogSourceNamespace` - - If operator not found, provide error with list of available operators - -4. **Create Namespace**: - - Check if namespace exists: `oc get namespace {namespace} --ignore-not-found` - - If not exists, create it: - ```bash - oc create namespace {namespace} - ``` - - If exists, inform user and continue - -5. 
**Create OperatorGroup**: - - Check if OperatorGroup exists in the namespace: - ```bash - oc get operatorgroup -n {namespace} --ignore-not-found - ``` - - If no OperatorGroup exists, create one: - ```yaml - apiVersion: operators.coreos.com/v1 - kind: OperatorGroup - metadata: - name: {namespace}-operatorgroup - namespace: {namespace} - spec: - targetNamespaces: - - {namespace} - ``` - - Save to temporary file and apply: - ```bash - oc apply -f /tmp/operatorgroup-{operator-name}.yaml - ``` - - If OperatorGroup already exists, inform user and continue - -6. **Create Subscription**: - - Parse approval mode from flags (default: Automatic) - - Create Subscription manifest: - ```yaml - apiVersion: operators.coreos.com/v1alpha1 - kind: Subscription - metadata: - name: {operator-name} - namespace: {namespace} - spec: - channel: {channel} - name: {operator-name} - source: {source} - sourceNamespace: openshift-marketplace - installPlanApproval: {Automatic|Manual} - ``` - - Save to temporary file and apply: - ```bash - oc apply -f /tmp/subscription-{operator-name}.yaml - ``` - - Display the created subscription details - - If approval mode is Manual, display informational message: - ``` - ℹ️ InstallPlan approval set to Manual - You will need to manually approve InstallPlans for this operator. - Use: /olm:approve {operator-name} {namespace} - - Reference: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-approving-operator-upgrades_olm-updating-operators - ``` - -7. 
**Verify Installation**: - - Wait for InstallPlan to be created: - ```bash - oc get installplan -n {namespace} -l operators.coreos.com/operator={operator-name} - ``` - - If approval mode is Manual, check if InstallPlan needs approval: - ```bash - oc get installplan -n {namespace} -o json | jq '.items[] | select(.spec.approved==false)' - ``` - - If Manual and not approved, display message: - ``` - ⏸️ InstallPlan created but requires manual approval - - InstallPlan: {installplan-name} - To approve: /olm:approve {operator-name} {namespace} - Or manually: oc patch installplan {installplan-name} -n {namespace} \ - --type merge --patch '{"spec":{"approved":true}}' - - Waiting for approval... - ``` - - Wait for CSV to be created and reach "Succeeded" phase: - ```bash - oc get csv -n {namespace} -w - ``` - - Use a timeout of 5 minutes for the installation to complete (10 minutes if Manual approval) - - Poll every 10 seconds to check CSV status - - Display progress updates to the user - -8. **Display Results**: - - Show the installed operator's CSV name and version - - Show the operator deployment status: - ```bash - oc get deployments -n {namespace} - ``` - - List any pods created by the operator: - ```bash - oc get pods -n {namespace} - ``` - - Display success message with next steps or usage instructions - -9. **Cleanup Temporary Files**: - - Remove temporary YAML files created during installation: - ```bash - rm -f /tmp/operatorgroup-{operator-name}.yaml /tmp/subscription-{operator-name}.yaml - ``` - -## Return Value -- **Success**: Operator installed successfully with details about the CSV, deployments, and pods -- **Error**: Installation failed with specific error message and troubleshooting suggestions -- **Format**: Structured output showing: - - Namespace created/used - - OperatorGroup status - - Subscription created - - CSV status and version - - Deployment and pod status - -## Examples - -1. 
**Install cert-manager-operator with defaults**: - ``` - /olm:install openshift-cert-manager-operator - ``` - This will: - - Create namespace `cert-manager-operator` - - Discover default channel from PackageManifest - - Use `redhat-operators` catalog source - - Install the operator - -2. **Install cert-manager-operator with custom namespace**: - ``` - /olm:install openshift-cert-manager-operator my-cert-manager - ``` - This will install the operator in the `my-cert-manager` namespace. - -3. **Install with specific channel**: - ``` - /olm:install openshift-cert-manager-operator cert-manager-operator stable-v1 - ``` - This will install from the `stable-v1` channel. - -4. **Install from community catalog**: - ``` - /olm:install prometheus community-operators stable community-operators - ``` - This will install Prometheus from the community-operators catalog. - -5. **Install Red Hat Advanced Cluster Security**: - ``` - /olm:install rhacs-operator rhacs-operator stable - ``` - -6. **Install with manual approval mode**: - ``` - /olm:install openshift-cert-manager-operator cert-manager-operator stable-v1 redhat-operators --approval=Manual - ``` - This will install the operator but require manual approval for all upgrades. - -7. 
**Install with all parameters specified**: - ``` - /olm:install external-secrets-operator eso-operator stable-v0.10 redhat-operators --approval=Automatic - ``` - -## Arguments -- **$1** (operator-name): The name of the operator to install (required) - - Example: "openshift-cert-manager-operator" - - Must match the name in the operator's PackageManifest -- **$2** (namespace): Target namespace for the operator installation (optional) - - Default: `{operator-name}` (operator name without "openshift-" prefix if present) - - Example: "cert-manager-operator" -- **$3** (channel): Subscription channel (optional) - - Default: Auto-discovered from PackageManifest's default channel - - Example: "stable-v1", "tech-preview", "stable" -- **$4** (source): CatalogSource name (optional) - - Default: "redhat-operators" - - Other options: "certified-operators", "community-operators", "redhat-marketplace" -- **$5+** (flags): Optional flags - - `--approval=Automatic|Manual`: InstallPlan approval mode - - **Automatic** (default): Operator upgrades are automatically installed without user intervention - - **Manual**: Operator upgrades require explicit approval. Useful for: - - Production environments requiring change control - - Testing upgrades before applying - - Preventing unexpected operator updates - - Reference: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-approving-operator-upgrades_olm-updating-operators - -## Notes - -- **Automatic Channel Discovery**: If no channel is specified, the command automatically discovers and uses the operator's default channel from its PackageManifest -- **Namespace Convention**: By default, operators are installed in a namespace named after the operator, with any leading "openshift-" prefix removed (e.g., `openshift-cert-manager-operator` is installed in `cert-manager-operator`) -- **OperatorGroup Scope**: The created OperatorGroup targets only the installation namespace for better isolation -- **InstallPlan Approval**: Set to "Automatic" by default for seamless installation. 
Can be changed to "Manual" using `--approval=Manual` flag -- **Manual Approval Mode**: When using `--approval=Manual`: - - Initial installation may require manual approval of the InstallPlan - - All future upgrades will require explicit approval via `/olm:approve` command - - Provides better control over operator updates in production environments -- **Verification Timeout**: The command waits up to 5 minutes for the operator to install successfully (10 minutes for manual approval mode) -- **Cleanup**: Temporary YAML files are automatically removed after installation - -## Troubleshooting - -- **Operator not found**: Run `oc get packagemanifests -n openshift-marketplace` to see available operators -- **Permission denied**: Ensure you have cluster-admin privileges or the necessary RBAC permissions -- **Installation timeout**: Check the InstallPlan and CSV status manually: - ```bash - oc get installplan -n {namespace} - oc get csv -n {namespace} - oc describe csv -n {namespace} - ``` -- **Operator pod not starting**: Check pod logs: - ```bash - oc logs -n {namespace} deployment/{operator-deployment} - ``` - diff --git a/plugins/olm/commands/opm.md b/plugins/olm/commands/opm.md deleted file mode 100644 index bd30fe0de..000000000 --- a/plugins/olm/commands/opm.md +++ /dev/null @@ -1,359 +0,0 @@ ---- -description: Execute opm (Operator Package Manager) commands for building and managing operator catalogs -argument-hint: [arguments...] 
---- - -## Name -olm:opm - -## Synopsis -```bash -/olm:opm build-index-image [--cacheless] [--arch=] [--base-image=] [--builder-image=] -/olm:opm build-semver-index-image [--cacheless] [--arch=] [--base-image=] [--builder-image=] -/olm:opm generate-semver-template [--output=] [--major=true|false] [--minor=true|false] -/olm:opm list packages -/olm:opm list channels [package-name] -/olm:opm list bundles [package-name] -``` - -## Description -The `olm:opm` command provides a unified interface to `opm` (Operator Package Manager) operations for building and managing operator catalog indexes. It supports building catalog indexes, generating semver templates, and querying catalog contents. - -## Arguments -- `$1`: **action** - The action to perform: - - `build-index-image`: Build an index from an existing catalog directory - - `build-semver-index-image`: Build an index from a semver template - - `generate-semver-template`: Generate a semver template file - - `list`: List catalog contents (requires second argument: `packages`, `channels`, or `bundles`) -- `$2+`: Additional arguments specific to each action (see Actions section below) - -## Actions - -### build-index-image -Build an operator catalog index image from an existing catalog directory. 
- -**Synopsis:** -```bash -/olm:opm build-index-image [--cacheless] [--arch=] [--base-image=] [--builder-image=] -``` - -**Arguments:** -- `$2`: **catalog-path** - Path to the catalog directory containing the index configuration -- `$3`: **index-image-tag** - Full image tag for the resulting index image (e.g., `quay.io/myorg/mycatalog:v1.0.0`) -- `--cacheless`: Optional flag to build a cacheless image (uses `scratch` as base image; `--base-image` and `--builder-image` are ignored when this is set) -- `--arch=`: Optional architecture specification (default: `multi` for multi-arch build; can specify single arch like `amd64`, `arm64`, `ppc64le`, `s390x`) -- `--base-image=`: Optional base image for the index (default: `quay.io/operator-framework/opm:latest`; ignored if `--cacheless` is set) -- `--builder-image=`: Optional builder image (default: `quay.io/operator-framework/opm:latest`; ignored if `--cacheless` is set) - -**Examples:** -```bash -/olm:opm build-index-image catalog quay.io/myorg/mycatalog:v1.0.0 -/olm:opm build-index-image catalog quay.io/myorg/mycatalog:v1.0.0 --cacheless -/olm:opm build-index-image catalog quay.io/myorg/mycatalog:v1.0.0 --arch=amd64 -``` - -### build-semver-index-image -Build a multi-architecture operator catalog index image using the semver template format. 
- -**Synopsis:** -```bash -/olm:opm build-semver-index-image [--cacheless] [--arch=] [--base-image=] [--builder-image=] -``` - -**Arguments:** -- `$2`: **semver-template-file** - Path to the semver template configuration file (e.g., `catalog-config.yaml`) -- `$3`: **index-image-tag** - Full image tag for the resulting index image (e.g., `quay.io/myorg/mycatalog:v1.0.0`) -- `--cacheless`: Optional flag to build a cacheless image (uses `scratch` as base image; `--base-image` and `--builder-image` are ignored when this is set) -- `--arch=`: Optional architecture specification (default: `multi` for multi-arch build; can specify single arch like `amd64`, `arm64`, `ppc64le`, `s390x`) -- `--base-image=`: Optional base image for the index (default: `quay.io/operator-framework/opm:latest`; ignored if `--cacheless` is set) -- `--builder-image=`: Optional builder image (default: `quay.io/operator-framework/opm:latest`; ignored if `--cacheless` is set) - -**Examples:** -```bash -/olm:opm build-semver-index-image catalog-config.yaml quay.io/myorg/mycatalog:v1.0.0 -/olm:opm build-semver-index-image catalog-config.yaml quay.io/myorg/mycatalog:v1.0.0 --cacheless -/olm:opm build-semver-index-image catalog-config.yaml quay.io/myorg/mycatalog:v1.0.0 --arch=amd64 -/olm:opm build-semver-index-image catalog-config.yaml quay.io/myorg/mycatalog:v1.0.0 --arch=multi -``` - -### generate-semver-template -Generate a semver template configuration file for building operator catalogs. 
- -**Synopsis:** -```bash -/olm:opm generate-semver-template [--output=] [--major=true|false] [--minor=true|false] -``` - -**Arguments:** -- `$2`: **bundle-list** - Comma-separated list of bundle image references (e.g., `quay.io/org/bundle:v1.0.0,quay.io/org/bundle:v1.0.1`) -- `--output=`: Optional output file path (default: `catalog-semver-config.yaml` in current directory) -- `--major=true|false`: Optional flag to generate major version channels (default: `true`) -- `--minor=true|false`: Optional flag to generate minor version channels (default: `false`) - -**Examples:** -```bash -/olm:opm generate-semver-template quay.io/org/bundle:v1.0.0,quay.io/org/bundle:v1.0.1 -/olm:opm generate-semver-template quay.io/org/bundle:v1.0.0,quay.io/org/bundle:v1.0.1 --output=my-catalog.yaml -/olm:opm generate-semver-template quay.io/org/bundle:v1.0.0,quay.io/org/bundle:v1.1.0 --minor=true -``` - -### list packages -List all operator packages available in a catalog index. - -**Synopsis:** -```bash -/olm:opm list packages -``` - -**Arguments:** -- `$2`: **list** - Must be "list" -- `$3`: **packages** - Must be "packages" -- `$4`: **index-ref** - Catalog index reference, either: - - Image tag: `quay.io/myorg/mycatalog:v1.0.0` - - Directory path: `./catalog` or `/path/to/catalog` - -**Examples:** -```bash -/olm:opm list packages quay.io/olmqe/nginx8518-index-test:v1 -/olm:opm list packages ./catalog -``` - -### list channels -List channels for operator packages in a catalog index. 
- -**Synopsis:** -```bash -/olm:opm list channels <index-ref> [package-name] -``` - -**Arguments:** -- `$1`: **list** - Must be "list" -- `$2`: **channels** - Must be "channels" -- `$3`: **index-ref** - Catalog index reference (image tag or directory path) -- `$4`: **package-name** (Optional) - Name of a specific package to list channels for - -**Examples:** -```bash -/olm:opm list channels quay.io/olmqe/nginx8518-index-test:v1 -/olm:opm list channels quay.io/olmqe/nginx8518-index-test:v1 nginx85187 -/olm:opm list channels ./catalog -``` - -### list bundles -List bundles for operator packages in a catalog index. - -**Synopsis:** -```bash -/olm:opm list bundles <index-ref> [package-name] -``` - -**Arguments:** -- `$1`: **list** - Must be "list" -- `$2`: **bundles** - Must be "bundles" -- `$3`: **index-ref** - Catalog index reference (image tag or directory path) -- `$4`: **package-name** (Optional) - Name of a specific package to list bundles for - -**Examples:** -```bash -/olm:opm list bundles quay.io/olmqe/nginx8518-index-test:v1 -/olm:opm list bundles quay.io/olmqe/nginx8518-index-test:v1 nginx85187 -/olm:opm list bundles ./catalog -``` - -## Implementation - -### Step 1: Parse Action -- Extract the action from `$1` -- Validate the action is one of: `build-index-image`, `build-semver-index-image`, `generate-semver-template`, `list` -- If invalid action, display error with available actions - -### Step 2: Check Prerequisites -Verify required tools are installed: -- Check for `opm`: `which opm` - - If not found, provide installation instructions: -- For build actions, also check for `podman`: `which podman` - - If not found, provide installation instructions based on user's platform - -### Step 3: Route to Action Handler -Based on the action, call the appropriate implementation: - -#### For `build-index-image`: -1. 
**Parse Arguments and Set Defaults** - - Extract catalog path from `$2` - - Extract index image tag from `$3` - - Parse optional flags: `--cacheless`, `--arch`, `--base-image`, `--builder-image` - - Set defaults: arch=`multi`, base-image=`quay.io/operator-framework/opm:latest`, builder-image=`quay.io/operator-framework/opm:latest` - -2. **Verify Catalog Directory** - - Check catalog directory exists: `test -d ` - -3. **Validate Catalog** - ```bash - opm validate - ``` - -4. **Generate Dockerfile** - - If cacheless: `opm generate dockerfile --base-image=scratch` - - If normal: `opm generate dockerfile -b -i ` - -5. **Determine Build Platform** - - If arch=`multi`: `linux/amd64,linux/arm64,linux/ppc64le,linux/s390x` - - Otherwise: `linux/` - -6. **Create Podman Manifest** - ```bash - podman manifest rm 2>/dev/null || true - podman manifest create - ``` - -7. **Build Image** - ```bash - podman build --platform --manifest . -f catalog.Dockerfile - ``` - -8. **Push Manifest** - ```bash - podman manifest push - ``` - -9. **List Bundles in Index** - ```bash - opm alpha list bundles - ``` - -10. **Display Success Message** - -#### For `build-semver-index-image`: -1. **Parse Arguments and Set Defaults** - - Extract semver template file from `$2` - - Extract index image tag from `$3` - - Parse optional flags: `--cacheless`, `--arch`, `--base-image`, `--builder-image` - - Set defaults: arch=`multi`, base-image=`quay.io/operator-framework/opm:latest`, builder-image=`quay.io/operator-framework/opm:latest` - -2. **Verify Template File** - - Check file exists: `test -f ` - -3. **Create Catalog and Render Template** - ```bash - mkdir -p catalog - opm alpha render-template semver -o yaml > catalog/index.yaml - ``` - -4. **Validate Catalog** - ```bash - opm validate catalog - ``` - -5. **Generate Dockerfile** - - If cacheless: `opm generate dockerfile catalog --base-image=scratch` - - If normal: `opm generate dockerfile catalog -b -i ` - -6. 
**Determine Build Platform** - - If arch=`multi`: `linux/amd64,linux/arm64,linux/ppc64le,linux/s390x` - - Otherwise: `linux/` - -7. **Create Podman Manifest** - ```bash - podman manifest rm 2>/dev/null || true - podman manifest create - ``` - -8. **Build Image** - ```bash - podman build --platform --manifest . -f catalog.Dockerfile - ``` - -9. **Push Manifest** - ```bash - podman manifest push - ``` - -10. **List Bundles in Index** - ```bash - opm alpha list bundles - ``` - -11. **Display Success Message** - -#### For `generate-semver-template`: -1. **Parse Arguments and Set Defaults** - - Extract bundle list from `$2` - - Parse optional flags: `--output`, `--major`, `--minor` - - Set defaults: output=`catalog-semver-config.yaml`, major=`true`, minor=`false` - -2. **Validate Bundle List** - - Split by commas - - Validate each bundle is a valid image reference - -3. **Generate YAML Content** - ```yaml - Schema: olm.semver - GenerateMajorChannels: - GenerateMinorChannels: - Candidate: - Bundles: - - Image: - - Image: - ``` - -4. **Write Template File** - - Check if file exists and confirm overwrite if needed - - Write YAML content - -5. **Validate Generated File** - - Read back and verify YAML is well-formed - -6. **Display Success Message** - - Show file path, bundles included, settings - - Suggest next step: `/olm:opm build-semver-index-image ` - -#### For `list`: -1. **Parse List Type** - - Extract list type from `$2` (must be `packages`, `channels`, or `bundles`) - - If invalid, display error with available types - -2. **Parse Index Reference and Optional Package** - - Extract index-ref from `$3` - - Extract optional package-name from `$4` (for channels and bundles) - -3. **Determine Reference Type** - - Check if directory: `test -d ` - -4. **Execute List Command** - - For packages: `opm alpha list packages ` - - For channels: `opm alpha list channels [package-name]` - - For bundles: `opm alpha list bundles [package-name]` - -5. 
**Display Results** - - Show the output with appropriate formatting - - Display count of items found - -## Return Value - -**Format**: Varies by action - -- **build-index-image / build-semver-index-image**: Success message with image tag, architectures, and bundle list -- **generate-semver-template**: Success message with file path and configuration details -- **list**: Table or list of catalog contents - -On failure, displays: -- Clear error message indicating which step/action failed -- Relevant tool output for debugging -- Suggestions for resolution - -## Notes - -- Ensure you are authenticated to container registries before building/pushing images (use `podman login`) -- For build operations, the `catalog.Dockerfile` is created in the current working directory -- Multi-architecture builds can be time-consuming -- Cacheless builds result in smaller images and use `scratch` as the base image -- When using `--cacheless`, the `--base-image` and `--builder-image` options are ignored (scratch is always used as base) -- Index references can be either image tags or local directory paths -- Bundle images must be accessible from where you build the catalog -- Image tags should include the full registry hostname (e.g., `quay.io/org/image:tag` not `quay/org/image:tag`) - -## Related Commands - -- `/olm:install` - Install an operator using OLM -- `/olm:catalog` - Manage catalog sources -- `/olm:debug` - Debug OLM issues diff --git a/plugins/olm/commands/uninstall.md b/plugins/olm/commands/uninstall.md deleted file mode 100644 index 36c3ec14c..000000000 --- a/plugins/olm/commands/uninstall.md +++ /dev/null @@ -1,392 +0,0 @@ ---- -description: Uninstall a day-2 operator and optionally remove its resources -argument-hint: [namespace] [--remove-crds] [--remove-namespace] ---- - -## Name -olm:uninstall - -## Synopsis -``` -/olm:uninstall [namespace] [--remove-crds] [--remove-namespace] -``` - -## Description -The `olm:uninstall` command uninstalls a day-2 operator from an 
OpenShift cluster by removing its Subscription, ClusterServiceVersion (CSV), and optionally its Custom Resource Definitions (CRDs) and namespace. - -This command provides a comprehensive uninstallation workflow: -- Removes the operator's Subscription -- Deletes the ClusterServiceVersion (CSV) -- Optionally removes operator-managed deployments -- Optionally deletes Custom Resource Definitions (CRDs) -- Optionally removes the operator's namespace -- Provides detailed feedback on each step - -The command is designed to safely clean up operators installed via OLM, with optional flags for thorough cleanup of all operator-related resources. - -## Implementation - -The command performs the following steps: - -1. **Parse Arguments**: - - `$1`: Operator name (required) - The name of the operator to uninstall - - `$2`: Namespace (optional) - The namespace where operator is installed. If not provided, defaults to the operator name without the "openshift-" prefix (e.g., `cert-manager-operator` for `openshift-cert-manager-operator`) - - `$3+`: Flags (optional): - - `--remove-crds`: Remove Custom Resource Definitions after uninstalling - - `--remove-namespace`: Remove the operator's namespace after cleanup - - `--force`: Skip confirmation prompts - -2. **Prerequisites Check**: - - Verify `oc` CLI is installed: `which oc` - - Verify cluster access: `oc whoami` - - Check if user has cluster-admin or sufficient privileges - -3. **Verify Operator Installation**: - - Check if namespace exists: - ```bash - oc get namespace {namespace} --ignore-not-found - ``` - - Check if subscription exists: - ```bash - oc get subscription {operator-name} -n {namespace} --ignore-not-found - ``` - - If not found, display error: "Operator {operator-name} is not installed in namespace {namespace}" - - List what will be uninstalled - -4. 
**Display Uninstallation Plan**: - - Show operator details: - ```bash - oc get subscription {operator-name} -n {namespace} -o yaml - oc get csv -n {namespace} - ``` - - Display what will be removed: - - Subscription name and namespace - - CSV name and version - - Deployments (if any) - - CRDs (if `--remove-crds` flag is set) - - Namespace (if `--remove-namespace` flag is set) - -5. **Request User Confirmation** (unless `--force` flag is set): - - Display warning: - ``` - WARNING: You are about to uninstall {operator-name} from namespace {namespace}. - This will remove: - - Subscription: {subscription-name} - - ClusterServiceVersion: {csv-name} - - Operator deployments - [- Custom Resource Definitions (if --remove-crds is set)] - [- Namespace {namespace} (if --remove-namespace is set)] - - Are you sure you want to continue? (yes/no) - ``` - - Wait for user confirmation - - If user says no, abort operation - -6. **Delete Subscription**: - - Remove the operator's subscription: - ```bash - oc delete subscription {operator-name} -n {namespace} - ``` - - Verify deletion: - ```bash - oc get subscription {operator-name} -n {namespace} --ignore-not-found - ``` - - Display result - -7. **Delete ClusterServiceVersion (CSV)**: - - Get the CSV name: - ```bash - oc get csv -n {namespace} -o jsonpath='{.items[?(@.spec.displayName contains "{operator-name}")].metadata.name}' - ``` - - Delete the CSV: - ```bash - oc delete csv {csv-name} -n {namespace} - ``` - - This will automatically remove operator deployments - - Verify CSV is deleted: - ```bash - oc get csv -n {namespace} --ignore-not-found - ``` - -8. 
**Remove Operator Deployments** (if still present): - - List deployments created by the operator: - ```bash - oc get deployments -n {namespace} - ``` - - For operators like cert-manager with labeled resources: - ```bash - oc delete deployment -n {namespace} -l app.kubernetes.io/instance={operator-base-name} - ``` - - Verify deployments are deleted: - ```bash - oc get deployments -n {namespace} - ``` - -8.5. **Check for Orphaned Custom Resources** (before removing CRDs): - - Get list of CRDs managed by the operator from CSV: - ```bash - oc get csv -n {namespace} -o jsonpath='{.items[0].spec.customresourcedefinitions.owned[*].name}' - ``` - - For each CRD, search for CR instances across all namespaces: - ```bash - oc get --all-namespaces --ignore-not-found - ``` - - If CRs exist, list them with details: - ``` - WARNING: Found custom resources that may prevent clean uninstallation: - - namespace-1/ (kind: ) - - namespace-2/ (kind: ) - - These resources should be deleted before uninstalling the operator. - Do you want to delete these custom resources? (yes/no) - ``` - - If user confirms, delete each CR: - ```bash - oc delete -n - ``` - - This prevents namespace from getting stuck in Terminating state - - Reference: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-reinstalling-operators-after-failed-uninstallation_olm-troubleshooting-operator-issues - -9. **Remove Custom Resource Definitions** (if `--remove-crds` flag is set): - - **WARNING**: Display critical warning to user: - ``` - WARNING: Removing CRDs will delete ALL custom resources of these types across the entire cluster! - This action is irreversible and will affect all namespaces. - - Are you absolutely sure you want to remove CRDs? 
(yes/no) - ``` - - If user confirms, proceed with CRD removal - - Get list of CRDs owned by the operator: - ```bash - oc get csv {csv-name} -n {namespace} -o jsonpath='{.spec.customresourcedefinitions.owned[*].name}' - ``` - - For each CRD, check if custom resources exist: - ```bash - oc get {crd-name} --all-namespaces --ignore-not-found - ``` - - Display warning if custom resources exist - - Delete CRDs: - ```bash - oc delete crd {crd-name} - ``` - -10. **Remove Namespace** (if `--remove-namespace` flag is set): - - **WARNING**: Display warning: - ``` - WARNING: Removing namespace {namespace} will delete all resources in this namespace! - - Are you sure you want to remove namespace {namespace}? (yes/no) - ``` - - If user confirms: - ```bash - oc delete namespace {namespace} - ``` - - Monitor namespace deletion with timeout: - ```bash - oc wait --for=delete namespace/{namespace} --timeout=120s - ``` - - If namespace gets stuck in "Terminating" state after 120 seconds: - - Check for resources preventing deletion: - ```bash - oc api-resources --verbs=list --namespaced -o name | \ - xargs -n 1 oc get --show-kind --ignore-not-found -n {namespace} - ``` - - Check for finalizers on the namespace: - ```bash - oc get namespace {namespace} -o jsonpath='{.metadata.finalizers}' - ``` - - Display helpful error message: - ``` - ERROR: Namespace {namespace} is stuck in Terminating state. - - Possible causes: - - Resources with finalizers preventing deletion - - API services that are unavailable - - Custom resources that cannot be deleted - - To diagnose and fix, run: /olm:diagnose {operator-name} {namespace} - - Manual troubleshooting: - 1. Check remaining resources: - oc api-resources --verbs=list --namespaced -o name | \ - xargs -n 1 oc get --show-kind --ignore-not-found -n {namespace} - - 2. Check namespace finalizers: - oc get namespace {namespace} -o yaml | grep -A5 finalizers - - WARNING: Do NOT force-delete the namespace as it can lead to unstable cluster behavior. 
- See: https://access.redhat.com/solutions/4165791 - ``` - - Exit with error code - - Note: OperatorGroup will be automatically deleted with the namespace - -11. **Post-Uninstall Verification**: - - Verify all resources are cleaned up: - ```bash - oc get subscription,csv,installplan -n {namespace} --ignore-not-found - ``` - - Check if any CRDs remain (if they were supposed to be deleted): - ```bash - oc get crd | grep - ``` - - If uninstalling without `--remove-namespace`, check namespace is clean: - ```bash - oc get all -n {namespace} - ``` - - Display any remaining resources with suggestions for cleanup - -12. **Display Uninstallation Summary**: - - Show what was successfully removed: - ``` - ✓ Uninstallation Summary: - ✓ Subscription '{operator-name}' deleted - ✓ CSV '{csv-name}' deleted - ✓ Operator deployments removed - [✓ X custom resources deleted] - [✓ Y CRDs removed] - [✓ Namespace '{namespace}' deleted] - ``` - - If CRDs or namespace were NOT removed, provide instructions: - ``` - Note: The following resources were NOT removed: - - Custom Resource Definitions (use --remove-crds to remove) - - Namespace {namespace} (use --remove-namespace to remove) - - To completely remove all operator resources, run: - /olm:uninstall {operator-name} {namespace} --remove-crds --remove-namespace - ``` - - **Important warning about reinstallation**: - ``` - IMPORTANT: Before reinstalling this operator, verify all resources are cleaned: - - oc get subscription,csv,installplan -n {namespace} - oc get crd | grep - - Failure to completely uninstall may cause reinstallation issues. 
- See: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-reinstalling-operators-after-failed-uninstallation_olm-troubleshooting-operator-issues - ``` - -## Return Value -- **Success**: Operator uninstalled successfully with summary of removed resources -- **Partial Success**: Some resources removed with warnings about remaining resources -- **Error**: Uninstallation failed with specific error message -- **Format**: Structured output showing: - - Subscription deletion status - - CSV deletion status - - Deployment removal status - - CRD removal status (if applicable) - - Namespace deletion status (if applicable) - -## Examples - -1. **Uninstall cert-manager-operator (basic)**: - ``` - /olm:uninstall openshift-cert-manager-operator - ``` - -2. **Uninstall with custom namespace**: - ``` - /olm:uninstall openshift-cert-manager-operator my-cert-manager - ``` - -3. **Complete cleanup including namespace**: - ``` - /olm:uninstall openshift-cert-manager-operator cert-manager-operator --remove-crds --remove-namespace - ``` - This performs a complete cleanup of all operator-related resources. - -4. **Force uninstall without prompts**: - ``` - /olm:uninstall openshift-cert-manager-operator cert-manager-operator --force - ``` - Skips all confirmation prompts (use with caution!). 
- -## Arguments -- **$1** (operator-name): The name of the operator to uninstall (required) - - Example: "openshift-cert-manager-operator" - - Must match the Subscription name -- **$2** (namespace): The namespace where operator is installed (optional) - - Default: `{operator-name}` (operator name without "openshift-" prefix) - - Example: "cert-manager-operator" -- **$3+** (flags): Optional flags (can combine multiple): - - `--remove-crds`: Remove Custom Resource Definitions (WARNING: affects entire cluster) - - `--remove-namespace`: Remove the operator's namespace and all its resources - - `--force`: Skip all confirmation prompts (use with caution) - -## Safety Features - -1. **Multiple Confirmations**: Separate confirmations for CRD and namespace removal -2. **Detailed Warnings**: Clear warnings about the scope of deletions -3. **Verification Steps**: Checks that resources exist before attempting deletion -4. **Summary Report**: Detailed summary of what was and wasn't removed -5. **Graceful Failures**: Continues with remaining steps if individual deletions fail - -## Troubleshooting - -- **Subscription not found**: Verify the operator name and namespace: - ```bash - oc get subscriptions --all-namespaces | grep {operator-name} - ``` -- **CSV won't delete**: Check for finalizers: - ```bash - oc get csv {csv-name} -n {namespace} -o yaml | grep finalizers - ``` - If finalizers are present, they may be waiting for resources to be cleaned up. Check operator logs and events. - -- **Namespace stuck in Terminating**: This is a common issue after operator uninstallation. - ```bash - # Find remaining resources - oc api-resources --verbs=list --namespaced -o name | \ - xargs -n 1 oc get --show-kind --ignore-not-found -n {namespace} - - # Check namespace finalizers - oc get namespace {namespace} -o yaml | grep -A5 finalizers - ``` - **IMPORTANT**: Do not force-delete the namespace. This can cause cluster instability. 
- Instead, use `/olm:diagnose {operator-name} {namespace}` to diagnose and fix the issue. - -- **CRDs won't delete**: Check for remaining custom resources: - ```bash - oc get {crd-name} --all-namespaces - ``` - CRDs cannot be deleted while CR instances exist. Delete all CRs first. - -- **Custom resources won't delete**: Some CRs may have finalizers preventing deletion: - ```bash - oc get -n -o yaml | grep finalizers - ``` - The operator controller (if still running) should remove finalizers. If operator is already deleted, you may need to manually patch the CR to remove finalizers (use with extreme caution). - -- **Permission denied**: Ensure you have cluster-admin privileges for CRD deletion: - ```bash - oc auth can-i delete crd - ``` - -- **Reinstallation fails after uninstall**: This usually means cleanup was incomplete. - Run these checks before reinstalling: - ```bash - # Check for remaining subscriptions/CSVs - oc get subscription,csv -n {namespace} - - # Check for remaining CRDs - oc get crd | grep - - # Check if namespace is clean or stuck - oc get namespace {namespace} - ``` - See: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-reinstalling-operators-after-failed-uninstallation_olm-troubleshooting-operator-issues - -## Related Commands - -- `/olm:install` - Install a day-2 operator -- `/olm:list` - List installed operators -- `/olm:status` - Check operator status before uninstalling -- `/olm:diagnose` - Diagnose and fix uninstallation issues -- `/olm:upgrade` - Upgrade an operator - -## Additional Resources - -- [Red Hat OpenShift: Deleting Operators from a cluster](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-deleting-operators-from-a-cluster) -- [Red Hat OpenShift: Reinstalling Operators after failed 
uninstallation](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-reinstalling-operators-after-failed-uninstallation_olm-troubleshooting-operator-issues) -- [Operator Lifecycle Manager Documentation](https://olm.operatorframework.io/) - diff --git a/plugins/olm/commands/upgrade.md b/plugins/olm/commands/upgrade.md deleted file mode 100644 index 75434f615..000000000 --- a/plugins/olm/commands/upgrade.md +++ /dev/null @@ -1,349 +0,0 @@ ---- -description: Update an operator to the latest version or switch channels -argument-hint: [namespace] [--channel=] [--approve] ---- - -## Name -olm:upgrade - -## Synopsis -``` -/olm:upgrade [namespace] [--channel=] [--approve] -``` - -## Description -The `olm:upgrade` command updates an installed operator to the latest version in its current channel or switches to a different channel. It can also approve pending InstallPlans for operators with manual approval mode. - -This command helps you: -- Update operators to the latest version in their channel -- Switch operators to different channels (e.g., stable to tech-preview) -- Approve pending upgrade InstallPlans for manual approval mode -- Monitor upgrade progress -- Rollback on failure (if possible via OLM) - -## Implementation - -The command performs the following steps: - -1. **Parse Arguments**: - - `$1`: Operator name (required) - Name of the operator to upgrade - - `$2`: Namespace (optional) - Namespace where operator is installed - - If not provided, searches for the operator across all namespaces - - `$3+`: Flags (optional): - - `--channel=`: Switch to a different channel - - `--approve`: Automatically approve pending InstallPlan (for manual approval mode) - -2. **Prerequisites Check**: - - Verify `oc` CLI is installed: `which oc` - - Verify cluster access: `oc whoami` - - Check if user has sufficient privileges - -3. 
**Locate Operator**: - - If namespace provided, verify operator exists: - ```bash - oc get subscription {operator-name} -n {namespace} --ignore-not-found - ``` - - If no namespace provided, search across all namespaces: - ```bash - oc get subscription --all-namespaces -o json | jq -r '.items[] | select(.spec.name=="{operator-name}") | .metadata.namespace' - ``` - - If not found, display error with suggestions - - If multiple instances found, prompt user to specify namespace - -4. **Get Current State**: - - Get current Subscription: - ```bash - oc get subscription {operator-name} -n {namespace} -o json - ``` - - Extract: - - Current channel: `.spec.channel` - - Install plan approval: `.spec.installPlanApproval` - - Installed CSV: `.status.installedCSV` - - Current CSV: `.status.currentCSV` - - Get current CSV version: - ```bash - oc get csv {installed-csv} -n {namespace} -o jsonpath='{.spec.version}' - ``` - -5. **Check for Available Updates**: - - Get PackageManifest: - ```bash - oc get packagemanifest {operator-name} -n openshift-marketplace -o json - ``` - - Extract available channels and their latest versions - - If `--channel` flag is specified, verify channel exists - - If no channel flag, check for updates in current channel - - Compare current version with latest available version - - Reference: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-updating-operators - -6. **Display Upgrade Plan**: - ``` - Operator Upgrade Plan: - - Operator: {display-name} - Namespace: {namespace} - Current Version: {current-version} - Current Channel: {current-channel} - - [If switching channels:] - Target Channel: {new-channel} - Target Version: {new-version} - - [If upgrading in same channel:] - Latest Version: {latest-version} (in channel: {current-channel}) - - Approval Mode: {Automatic|Manual} - ``` - -7. 
**Check for Pending InstallPlans** (for manual approval mode): - - Get pending InstallPlans: - ```bash - oc get installplan -n {namespace} -o json | jq '.items[] | select(.spec.approved==false)' - ``` - - If pending InstallPlan exists and `--approve` flag is set: - - Display InstallPlan details - - Approve the InstallPlan (skip to step 9) - - If pending InstallPlan exists and no `--approve` flag: - ``` - ⏸️ Pending InstallPlan found (requires manual approval) - - InstallPlan: {installplan-name} - Target Version: {target-version} - - To approve: /olm:upgrade {operator-name} {namespace} --approve - Or use: /olm:approve {operator-name} {namespace} - ``` - - Exit, waiting for user to approve - -8. **Perform Channel Switch** (if `--channel` flag provided): - - Confirm with user (unless `--force` flag): - ``` - WARNING: Switching channels may upgrade or downgrade the operator. - - Current: {current-channel} ({current-version}) - Target: {new-channel} ({target-version}) - - Continue? (yes/no) - ``` - - Update Subscription to new channel: - ```bash - oc patch subscription {operator-name} -n {namespace} \ - --type merge --patch '{"spec":{"channel":"{new-channel}"}}' - ``` - - Display confirmation: - ``` - ✓ Subscription updated to channel: {new-channel} - ``` - -9. **Approve Pending InstallPlan** (if `--approve` flag or automatic approval): - - If approval mode is Manual and `--approve` flag is set: - ```bash - oc patch installplan {installplan-name} -n {namespace} \ - --type merge --patch '{"spec":{"approved":true}}' - ``` - - Display approval confirmation: - ``` - ✓ InstallPlan approved: {installplan-name} - ``` - -10. **Monitor Upgrade Progress**: - - Wait for new InstallPlan to be created (if switching channels): - ```bash - oc get installplan -n {namespace} -w --request-timeout=60s - ``` - - Wait for new CSV to reach "Succeeded" phase: - ```bash - oc get csv -n {namespace} -w --request-timeout=300s - ``` - - Display progress updates: - ``` - 🔄 Upgrade in progress... 
- ⏳ Waiting for InstallPlan to complete... - ⏳ New CSV installing: {new-csv-name} - ⏳ Old CSV replacing: {old-csv-name} - ``` - - Poll every 10 seconds to check status - - Timeout: 10 minutes for upgrade to complete - -11. **Verify Upgrade Success**: - - Check new CSV status: - ```bash - oc get csv -n {namespace} -o json - ``` - - Verify new CSV phase is "Succeeded" - - Get new version: - ```bash - oc get csv {new-csv-name} -n {namespace} -o jsonpath='{.spec.version}' - ``` - - Check deployments are healthy: - ```bash - oc get deployments -n {namespace} - ``` - - Check pods are running: - ```bash - oc get pods -n {namespace} - ``` - -12. **Display Upgrade Summary**: - ``` - ✓ Operator Upgrade Complete! - - Operator: {display-name} - Namespace: {namespace} - Previous Version: {old-version} - Current Version: {new-version} - Channel: {channel} - - Deployment Status: - - {deployment-1}: 1/1 replicas ready - - {deployment-2}: 1/1 replicas ready - - To check status: /olm:status {operator-name} {namespace} - ``` - -13. **Handle Upgrade Failures**: - - If upgrade fails or times out: - ``` - ❌ Operator upgrade failed - - Current State: - - CSV: {csv-name} (Phase: {phase}) - - Message: {error-message} - - Troubleshooting steps: - 1. Check CSV status: oc describe csv {csv-name} -n {namespace} - 2. Check events: oc get events -n {namespace} --sort-by='.lastTimestamp' - 3. Check InstallPlan: oc get installplan -n {namespace} - 4. 
Run diagnostics: /olm:diagnose {operator-name} {namespace} - - To rollback (if OLM supports): - oc patch subscription {operator-name} -n {namespace} \ - --type merge --patch '{"spec":{"channel":"{old-channel}"}}' - ``` - -## Return Value -- **Success**: Operator upgraded successfully with new version details -- **Pending Approval**: Upgrade waiting for manual approval with instructions -- **No Update Available**: Operator is already at the latest version -- **Error**: Upgrade failed with specific error message and troubleshooting guidance -- **Format**: Structured output showing: - - Previous and current versions - - Channel information - - Deployment and pod status - - Next steps or related commands - -## Examples - -1. **Check for and install updates in current channel**: - ``` - /olm:upgrade openshift-cert-manager-operator - ``` - -2. **Upgrade with specific namespace**: - ``` - /olm:upgrade external-secrets-operator eso-operator - ``` - -3. **Switch to a different channel**: - ``` - /olm:upgrade openshift-cert-manager-operator cert-manager-operator --channel=tech-preview-v1.14 - ``` - This switches from stable-v1 to tech-preview-v1.14 channel. - -4. **Approve pending upgrade (manual approval mode)**: - ``` - /olm:upgrade openshift-cert-manager-operator --approve - ``` - -5. 
**Switch channel and approve in one command**: - ``` - /olm:upgrade prometheus prometheus-operator --channel=beta --approve - ``` - -## Arguments -- **$1** (operator-name): Name of the operator to upgrade (required) - - Example: "openshift-cert-manager-operator" - - Must match the operator's Subscription name -- **$2** (namespace): Namespace where operator is installed (optional) - - If not provided, searches all namespaces - - Example: "cert-manager-operator" -- **$3+** (flags): Optional flags - - `--channel=`: Switch to specified channel - - Example: `--channel=stable-v1`, `--channel=tech-preview` - - Triggers upgrade/downgrade to the version in that channel - - `--approve`: Automatically approve pending InstallPlan - - Only needed for operators with Manual approval mode - - Equivalent to `/olm:approve` command - -## Notes - -- **Automatic Updates**: Operators with `installPlanApproval: Automatic` will upgrade automatically when new versions are available in their channel -- **Manual Approval**: Operators with `installPlanApproval: Manual` require explicit approval via `--approve` flag or `/olm:approve` command -- **Channel Switching**: Changing channels may result in upgrade or downgrade depending on the versions in each channel -- **Rollback**: OLM has limited rollback support. 
Switching back to the previous channel may work, but data migration issues may occur -- **Upgrade Timing**: Upgrades happen according to the operator's upgrade strategy (some may cause downtime) - -## Troubleshooting - -- **No updates available**: - ```bash - # Check current version - oc get csv -n {namespace} - - # Check available versions - oc get packagemanifest {operator-name} -n openshift-marketplace -o json - ``` - -- **Upgrade stuck or pending**: - ```bash - # Check InstallPlan status - oc get installplan -n {namespace} - - # Check for events - oc get events -n {namespace} --sort-by='.lastTimestamp' | tail -20 - ``` - -- **Manual approval required**: - ```bash - # List pending InstallPlans - oc get installplan -n {namespace} -o json | jq '.items[] | select(.spec.approved==false)' - - # Approve specific InstallPlan - /olm:approve {operator-name} {namespace} - ``` - -- **Upgrade failed**: - ```bash - # Check CSV status - oc describe csv -n {namespace} - - # Check operator logs - oc logs -n {namespace} deployment/{operator-deployment} - - # Run diagnostics - /olm:diagnose {operator-name} {namespace} - ``` - -- **Rollback needed**: - - OLM doesn't have built-in rollback - - Can try switching back to previous channel, but may have issues: - ```bash - oc patch subscription {operator-name} -n {namespace} \ - --type merge --patch '{"spec":{"channel":"{old-channel}"}}' - ``` - - Consider backup/restore of custom resources before upgrading - -## Related Commands - -- `/olm:status ` - Check current version and available updates -- `/olm:approve ` - Approve pending InstallPlans -- `/olm:install ` - Install an operator -- `/olm:diagnose ` - Diagnose upgrade issues - -## Additional Resources - -- [Red Hat OpenShift: Updating Installed Operators](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-updating-operators) -- [Red Hat OpenShift: Approving Operator 
Upgrades](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-approving-operator-upgrades_olm-updating-operators) -- [Operator Lifecycle Manager Documentation](https://olm.operatorframework.io/) - - From 64d60c5e3e370fef55774b2cd98fcc6b11ab2211 Mon Sep 17 00:00:00 2001 From: Stephen Benjamin Date: Wed, 6 May 2026 06:39:34 -0400 Subject: [PATCH 7/7] chore: process /save directive from pruning PR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restored and added to .pruneprotect: - plugins/olm/ (all 6 commands) — saved by @stbenjam Co-Authored-By: Claude Opus 4.6 --- .claude-plugin/marketplace.json | 2 +- .pruneprotect | 3 + PLUGINS.md | 6 + docs/data.json | 38 ++- plugins/olm/.claude-plugin/plugin.json | 2 +- plugins/olm/commands/approve.md | 305 +++++++++++++++++ plugins/olm/commands/catalog.md | 433 +++++++++++++++++++++++++ plugins/olm/commands/install.md | 272 ++++++++++++++++ plugins/olm/commands/opm.md | 359 ++++++++++++++++++++ plugins/olm/commands/uninstall.md | 392 ++++++++++++++++++++++ plugins/olm/commands/upgrade.md | 349 ++++++++++++++++++++ 11 files changed, 2158 insertions(+), 3 deletions(-) create mode 100644 plugins/olm/commands/approve.md create mode 100644 plugins/olm/commands/catalog.md create mode 100644 plugins/olm/commands/install.md create mode 100644 plugins/olm/commands/opm.md create mode 100644 plugins/olm/commands/uninstall.md create mode 100644 plugins/olm/commands/upgrade.md diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index d85e949e8..2e0ea0ca7 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -56,7 +56,7 @@ "name": "olm", "source": "./plugins/olm", "description": "OLM (Operator Lifecycle Manager) plugin for operator management and debugging", - "version": "0.1.2" + "version": "0.1.1" }, { "name": "olm-team", diff --git a/.pruneprotect b/.pruneprotect index 
201631758..522387195 100644 --- a/.pruneprotect +++ b/.pruneprotect @@ -17,3 +17,6 @@ plugins/ci/ plugins/hcp/ plugins/sosreport/ +# Saved by @stbenjam on 2026-05-06 +plugins/olm/ + diff --git a/PLUGINS.md b/PLUGINS.md index b5464b582..6e7dafd34 100644 --- a/PLUGINS.md +++ b/PLUGINS.md @@ -189,11 +189,17 @@ See [plugins/must-gather/README.md](plugins/must-gather/README.md) for detailed OLM (Operator Lifecycle Manager) plugin for operator management and debugging **Commands:** +- **`/olm:approve` ` [namespace] [--all]`** - Approve pending InstallPlans for operator installations and upgrades +- **`/olm:catalog` ` [arguments]`** - Manage catalog sources for discovering and installing operators - **`/olm:debug` ` [olm-version]`** - Debug OLM issues using must-gather logs and source code analysis - **`/olm:diagnose` `[operator-name] [namespace] [--fix] [--cluster]`** - Diagnose and optionally fix common OLM and operator issues +- **`/olm:install` ` [namespace] [channel] [source] [--approval=Automatic|Manual]`** - Install a day-2 operator using Operator Lifecycle Manager - **`/olm:list` `[namespace] [--all-namespaces]`** - List installed operators in the cluster +- **`/olm:opm` ` [arguments...]`** - Execute opm (Operator Package Manager) commands for building and managing operator catalogs - **`/olm:search` `[query] [--catalog ]`** - Search for available operators in catalog sources - **`/olm:status` ` [namespace]`** - Get detailed status and health information for an operator +- **`/olm:uninstall` ` [namespace] [--remove-crds] [--remove-namespace]`** - Uninstall a day-2 operator and optionally remove its resources +- **`/olm:upgrade` ` [namespace] [--channel=] [--approve]`** - Update an operator to the latest version or switch channels See [plugins/olm/README.md](plugins/olm/README.md) for detailed documentation. 
diff --git a/docs/data.json b/docs/data.json index d93689f0d..028db3915 100644 --- a/docs/data.json +++ b/docs/data.json @@ -851,6 +851,18 @@ }, { "commands": [ + { + "argument_hint": " [namespace] [--all]", + "description": "Approve pending InstallPlans for operator installations and upgrades", + "name": "approve", + "synopsis": "/olm:approve [namespace] [--all]" + }, + { + "argument_hint": " [arguments]", + "description": "Manage catalog sources for discovering and installing operators", + "name": "catalog", + "synopsis": "/olm:catalog list" + }, { "argument_hint": " [olm-version]", "description": "Debug OLM issues using must-gather logs and source code analysis", @@ -863,12 +875,24 @@ "name": "diagnose", "synopsis": "/olm:diagnose [operator-name] [namespace] [--fix] [--cluster]" }, + { + "argument_hint": " [namespace] [channel] [source] [--approval=Automatic|Manual]", + "description": "Install a day-2 operator using Operator Lifecycle Manager", + "name": "install", + "synopsis": "/olm:install [namespace] [channel] [source] [--approval=Automatic|Manual]" + }, { "argument_hint": "[namespace] [--all-namespaces]", "description": "List installed operators in the cluster", "name": "list", "synopsis": "/olm:list [namespace] [--all-namespaces]" }, + { + "argument_hint": " [arguments...]", + "description": "Execute opm (Operator Package Manager) commands for building and managing operator catalogs", + "name": "opm", + "synopsis": "/olm:opm build-index-image [--cacheless] [--arch=] [--base-image=] [--builder-image=]" + }, { "argument_hint": "[query] [--catalog ]", "description": "Search for available operators in catalog sources", @@ -880,6 +904,18 @@ "description": "Get detailed status and health information for an operator", "name": "status", "synopsis": "/olm:status [namespace]" + }, + { + "argument_hint": " [namespace] [--remove-crds] [--remove-namespace]", + "description": "Uninstall a day-2 operator and optionally remove its resources", + "name": "uninstall", + 
"synopsis": "/olm:uninstall [namespace] [--remove-crds] [--remove-namespace]" + }, + { + "argument_hint": " [namespace] [--channel=] [--approve]", + "description": "Update an operator to the latest version or switch channels", + "name": "upgrade", + "synopsis": "/olm:upgrade [namespace] [--channel=] [--approve]" } ], "description": "OLM (Operator Lifecycle Manager) plugin for operator management and debugging", @@ -887,7 +923,7 @@ "hooks": [], "name": "olm", "skills": [], - "version": "0.1.2" + "version": "0.1.1" }, { "commands": [ diff --git a/plugins/olm/.claude-plugin/plugin.json b/plugins/olm/.claude-plugin/plugin.json index fb2925423..b6958e1ed 100644 --- a/plugins/olm/.claude-plugin/plugin.json +++ b/plugins/olm/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "olm", "description": "OLM (Operator Lifecycle Manager) plugin for operator management and debugging", - "version": "0.1.2", + "version": "0.1.1", "author": { "name": "github.com/openshift-eng" } diff --git a/plugins/olm/commands/approve.md b/plugins/olm/commands/approve.md new file mode 100644 index 000000000..3aa5437b0 --- /dev/null +++ b/plugins/olm/commands/approve.md @@ -0,0 +1,305 @@ +--- +description: Approve pending InstallPlans for operator installations and upgrades +argument-hint: [namespace] [--all] +--- + +## Name +olm:approve + +## Synopsis +``` +/olm:approve [namespace] [--all] +``` + +## Description +The `olm:approve` command approves pending InstallPlans for operators with manual approval mode. This is required for operators that have `installPlanApproval: Manual` in their Subscription to proceed with installation or upgrades. + +This command helps you: +- Approve operator installations that are waiting for manual approval +- Approve operator upgrades +- Review what will be installed/upgraded before approval +- Batch approve multiple pending InstallPlans + +## Implementation + +The command performs the following steps: + +1. 
**Parse Arguments**: + - `$1`: Operator name (required) - Name of the operator + - `$2`: Namespace (optional) - Namespace where operator is installed + - If not provided, searches for the operator across all namespaces + - `$3`: Flag (optional): + - `--all`: Approve all pending InstallPlans in the namespace + +2. **Prerequisites Check**: + - Verify `oc` CLI is installed: `which oc` + - Verify cluster access: `oc whoami` + - Check if user has sufficient privileges + +3. **Locate Operator**: + - If namespace provided, verify operator exists: + ```bash + oc get subscription {operator-name} -n {namespace} --ignore-not-found + ``` + - If no namespace provided, search across all namespaces: + ```bash + oc get subscription --all-namespaces -o json | jq -r '.items[] | select(.spec.name=="{operator-name}") | .metadata.namespace' + ``` + - If not found, display error with suggestions + +4. **Check Subscription Approval Mode**: + - Get Subscription approval mode: + ```bash + oc get subscription {operator-name} -n {namespace} -o jsonpath='{.spec.installPlanApproval}' + ``` + - If mode is "Automatic", display informational message: + ``` + ℹ️ Operator '{operator-name}' has automatic approval enabled. + InstallPlans are approved automatically and don't require manual intervention. + + Current Subscription approval mode: Automatic + + To switch to manual approval mode: + oc patch subscription {operator-name} -n {namespace} \ + --type merge --patch '{"spec":{"installPlanApproval":"Manual"}}' + ``` + - Exit if automatic (no approval needed) + +5. 
**Find Pending InstallPlans**: + - Get all InstallPlans for the operator: + ```bash + oc get installplan -n {namespace} -o json + ``` + - Filter for unapproved plans related to this operator: + ```bash + oc get installplan -n {namespace} -o json | \ + jq '.items[] | select(.spec.approved==false and any(.spec.clusterServiceVersionNames[]; contains("{operator-name}")))' + ``` + - If no pending InstallPlans found: + ``` + ✓ No pending InstallPlans found for operator '{operator-name}' + + The operator is up to date or already approved. + + To check operator status: /olm:status {operator-name} {namespace} + ``` + - Exit with success + +6. **Display InstallPlan Details**: + For each pending InstallPlan, display: + ``` + ⏸️ Pending InstallPlan Found + + InstallPlan: {installplan-name} + Namespace: {namespace} + Phase: {phase} + Approved: false + + ClusterServiceVersions to be installed/upgraded: + - {csv-name-1} ({version-1}) + - {csv-name-2} ({version-2}) + + Resources to be created/updated: + - CustomResourceDefinitions: {crd-count} + - ServiceAccounts: {sa-count} + - ClusterRoles: {role-count} + - Deployments: {deployment-count} + + [If upgrade:] + Current Version: {current-version} + Target Version: {target-version} + ``` + +7. **Request User Confirmation** (unless `--all` flag): + - Display confirmation prompt: + ``` + Do you want to approve this InstallPlan? (yes/no) + ``` + - If user says no, skip this InstallPlan + - If user says yes, proceed to approval + +8. 
**Approve InstallPlan**: + - Patch the InstallPlan to approve it: + ```bash + oc patch installplan {installplan-name} -n {namespace} \ + --type merge --patch '{"spec":{"approved":true}}' + ``` + - Verify approval: + ```bash + oc get installplan {installplan-name} -n {namespace} -o jsonpath='{.spec.approved}' + ``` + - Display confirmation: + ``` + ✓ InstallPlan approved: {installplan-name} + ``` + - Reference: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-approving-operator-upgrades_olm-updating-operators + +9. **Monitor InstallPlan Execution** (optional): + - Watch InstallPlan phase change to "Complete": + ```bash + oc get installplan {installplan-name} -n {namespace} -w --timeout=120s + ``` + - Display progress: + ``` + 🔄 InstallPlan executing... + ⏳ Installing resources... + ``` + +10. **Verify Installation/Upgrade**: + - Wait for CSV to reach "Succeeded" phase: + ```bash + oc get csv -n {namespace} -o json | \ + jq -r '.items[] | select(.status.phase=="Succeeded") | .metadata.name' + ``` + - Display result: + ``` + ✓ Operator installation/upgrade complete + + CSV: {csv-name} + Version: {version} + Phase: Succeeded + + To check operator status: /olm:status {operator-name} {namespace} + ``` + +11. **Handle Multiple InstallPlans** (if `--all` flag): + - Process all pending InstallPlans for the operator + - Display summary: + ``` + ✓ Approved {count} InstallPlan(s) + + Approved: + - {installplan-1} + - {installplan-2} + + Monitoring installation progress... + ``` + +12. **Display Approval Summary**: + ``` + ✓ Approval Complete! + + Operator: {operator-name} + Namespace: {namespace} + Approved InstallPlans: {count} + + InstallPlan Status: + - {installplan-1}: Complete + - {installplan-2}: Installing... 
+ + Monitor progress: watch oc get csv,installplan -n {namespace} + ``` + +## Return Value +- **Success**: InstallPlan(s) approved successfully +- **No Pending Plans**: No InstallPlans require approval +- **Automatic Mode**: Operator has automatic approval (no action needed) +- **Error**: Approval failed with specific error message +- **Format**: Structured output showing: + - Approved InstallPlan names + - Installation/upgrade status + - Next steps or related commands + +## Examples + +1. **Approve pending InstallPlan for an operator**: + ``` + /olm:approve openshift-cert-manager-operator + ``` + +2. **Approve with specific namespace**: + ``` + /olm:approve external-secrets-operator eso-operator + ``` + +3. **Approve all pending InstallPlans**: + ``` + /olm:approve openshift-cert-manager-operator cert-manager-operator --all + ``` + This approves all pending InstallPlans for the operator in the namespace. + +4. **Check and approve after upgrade command**: + ``` + /olm:upgrade openshift-cert-manager-operator --channel=tech-preview + # Wait for InstallPlan to be created + /olm:approve openshift-cert-manager-operator + ``` + +## Arguments +- **$1** (operator-name): Name of the operator (required) + - Example: "openshift-cert-manager-operator" + - Must match the operator's Subscription name +- **$2** (namespace): Namespace where operator is installed (optional) + - If not provided, searches all namespaces + - Example: "cert-manager-operator" +- **$3** (flag): Optional flag + - `--all`: Approve all pending InstallPlans for this operator + - Useful when multiple upgrades are pending + - Skips individual confirmation prompts + +## Notes + +- **Manual Approval Mode**: This command only works for operators with `installPlanApproval: Manual` in their Subscription +- **Automatic Operators**: Operators with automatic approval don't need this command +- **Review Before Approval**: Always review what will be installed/upgraded before approving +- **Multiple InstallPlans**: An 
operator may have multiple pending InstallPlans if updates accumulated while waiting for approval +- **InstallPlan Retention**: Approved InstallPlans remain in the namespace for audit purposes + +## Troubleshooting + +- **No pending InstallPlans**: + ```bash + # List all InstallPlans + oc get installplan -n {namespace} + + # Check if operator is in automatic mode + oc get subscription {operator-name} -n {namespace} -o jsonpath='{.spec.installPlanApproval}' + ``` + +- **InstallPlan not executing after approval**: + ```bash + # Check InstallPlan status + oc describe installplan {installplan-name} -n {namespace} + + # Check for errors + oc get events -n {namespace} --sort-by='.lastTimestamp' | grep InstallPlan + ``` + +- **CSV not reaching Succeeded phase**: + ```bash + # Check CSV status + oc describe csv -n {namespace} + + # Check operator deployment + oc get deployments -n {namespace} + + # Check operator logs + oc logs -n {namespace} deployment/{operator-deployment} + ``` + +- **Permission denied**: + ```bash + # Check if you can patch InstallPlans + oc auth can-i patch installplan -n {namespace} + ``` + +- **Multiple namespaces found**: + - Specify the namespace explicitly in the command: + ``` + /olm:approve {operator-name} {specific-namespace} + ``` + +## Related Commands + +- `/olm:status ` - Check if InstallPlans are pending approval +- `/olm:upgrade ` - Trigger upgrade and approve in one command +- `/olm:install ` - Install operator with approval mode +- `/olm:list` - List operators and their approval modes + +## Additional Resources + +- [Red Hat OpenShift: Approving Operator Upgrades](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-approving-operator-upgrades_olm-updating-operators) +- [Red Hat OpenShift: Updating Installed Operators](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-updating-operators) +- [Operator Lifecycle 
Manager Documentation](https://olm.operatorframework.io/) + + diff --git a/plugins/olm/commands/catalog.md b/plugins/olm/commands/catalog.md new file mode 100644 index 000000000..cd43964ea --- /dev/null +++ b/plugins/olm/commands/catalog.md @@ -0,0 +1,433 @@ +--- +description: Manage catalog sources for discovering and installing operators +argument-hint: [arguments] +--- + +## Name +olm:catalog + +## Synopsis +``` +/olm:catalog list +/olm:catalog add <name> <image> [--namespace=openshift-marketplace] +/olm:catalog remove <name> [--namespace=openshift-marketplace] +/olm:catalog refresh <name> [--namespace=openshift-marketplace] +/olm:catalog status <name> [--namespace=openshift-marketplace] +``` + +## Description +The `olm:catalog` command manages catalog sources for operator discovery and installation. Catalog sources provide the list of operators available for installation in the cluster. + +This command helps you: +- List all available catalog sources and their health status +- Add custom or private catalog sources +- Remove catalog sources +- Refresh catalog sources to get latest operator updates + +## Implementation + +### Subcommand: list + +1. **Get All CatalogSources**: + ```bash + oc get catalogsource -n openshift-marketplace -o json + ``` + +2. **Parse CatalogSource Data**: + For each catalog, extract: + - Name: `.metadata.name` + - Display Name: `.spec.displayName` + - Publisher: `.spec.publisher` + - Source Type: `.spec.sourceType` (grpc, configmap, etc.) + - Image: `.spec.image` (for grpc type) + - Connection State: `.status.connectionState.lastObservedState` + - Last Updated: `.status.connectionState.lastConnect` + - Number of Operators: Count from PackageManifests with this catalog + +3. **Get Catalog Pod Status**: + ```bash + oc get pods -n openshift-marketplace -l olm.catalogSource={catalog-name} + ``` + +4. 
**Format Output**: + ``` + ═══════════════════════════════════════════════════════════ + CATALOG SOURCES + ═══════════════════════════════════════════════════════════ + + NAME STATUS OPERATORS LAST UPDATED SOURCE TYPE + redhat-operators READY 150 2h ago grpc + certified-operators READY 45 3h ago grpc + community-operators READY 200 1h ago grpc + redhat-marketplace READY 30 4h ago grpc + custom-catalog FAILED 0 - grpc + + ═══════════════════════════════════════════════════════════ + DETAILS + ═══════════════════════════════════════════════════════════ + + redhat-operators: + Display Name: Red Hat Operators + Publisher: Red Hat + Image: registry.redhat.io/redhat/redhat-operator-index:v4.20 + Pod: redhat-operators-abc123 (Running) + + custom-catalog (FAILED): + Display Name: Custom Catalog + Publisher: My Company + Image: registry.example.com/custom-catalog:latest + Pod: custom-catalog-xyz789 (CrashLoopBackOff) + Error: ImagePullBackOff + + To troubleshoot: + /olm:catalog status custom-catalog + ``` + +### Subcommand: add + +1. **Parse Arguments**: + - `name`: Catalog source name (required) + - `image`: Catalog image (required) + - `--namespace`: Target namespace (default: openshift-marketplace) + - `--display-name`: Display name (optional) + - `--publisher`: Publisher name (optional) + +2. **Validate Image**: + - Check if image format is valid + - Optionally test image accessibility (if possible) + +3. **Create CatalogSource Manifest**: + ```yaml + apiVersion: operators.coreos.com/v1alpha1 + kind: CatalogSource + metadata: + name: {name} + namespace: {namespace} + spec: + sourceType: grpc + image: {image} + displayName: {display-name} + publisher: {publisher} + updateStrategy: + registryPoll: + interval: 30m + ``` + +4. **Apply CatalogSource**: + ```bash + oc apply -f /tmp/catalogsource-{name}.yaml + ``` + +5. **Wait for CatalogSource to be Ready**: + ```bash + oc wait --for=jsonpath='{.status.connectionState.lastObservedState}'=READY catalogsource/{name} -n {namespace} --timeout=300s + ``` + +6. 
**Verify Pod is Running**: + ```bash + oc get pods -n {namespace} -l olm.catalogSource={name} + ``` + +7. **Display Result**: + ``` + ✓ Catalog source added: {name} + + Name: {name} + Namespace: {namespace} + Image: {image} + Status: READY + Pod: {pod-name} (Running) + + To search operators: /olm:search --catalog {name} + ``` + +### Subcommand: remove + +1. **Parse Arguments**: + - `name`: Catalog source name (required) + - `--namespace`: Namespace (default: openshift-marketplace) + +2. **Check if CatalogSource Exists**: + ```bash + oc get catalogsource {name} -n {namespace} --ignore-not-found + ``` + +3. **Check for Operators Using This Catalog**: + ```bash + oc get subscription --all-namespaces -o json | \ + jq -r '.items[] | select(.spec.source=="{name}") | "\(.metadata.namespace)/\(.metadata.name)"' + ``` + +4. **Display Warning** (if operators found): + ``` + WARNING: The following operators are using this catalog: + - namespace-1/operator-1 + - namespace-2/operator-2 + + Removing this catalog will prevent these operators from receiving updates. + + Do you want to continue? (yes/no) + ``` + +5. **Delete CatalogSource**: + ```bash + oc delete catalogsource {name} -n {namespace} + ``` + +6. **Wait for Pod to be Deleted**: + ```bash + oc wait --for=delete pod -l olm.catalogSource={name} -n {namespace} --timeout=60s + ``` + +7. **Display Result**: + ``` + ✓ Catalog source removed: {name} + ``` + +### Subcommand: refresh + +1. **Parse Arguments**: + - `name`: Catalog source name (required) + - `--namespace`: Namespace (default: openshift-marketplace) + +2. **Get Current CatalogSource**: + ```bash + oc get catalogsource {name} -n {namespace} -o json + ``` + +3. **Trigger Refresh by Deleting Pod**: + ```bash + oc delete pod -n {namespace} -l olm.catalogSource={name} + ``` + - This forces OLM to recreate the pod and re-fetch catalog data + +4. 
**Wait for New Pod to be Ready**: + ```bash + oc wait --for=condition=Ready pod -l olm.catalogSource={name} -n {namespace} --timeout=300s + ``` + +5. **Verify Catalog is Updated**: + ```bash + oc get catalogsource {name} -n {namespace} -o json | \ + jq -r '.status.connectionState.lastConnect' + ``` + +6. **Display Result**: + ``` + ✓ Catalog source refreshed: {name} + + Last Updated: {timestamp} + Status: READY + Pod: {pod-name} (Running) + + New operators may now be available: /olm:search --catalog {name} + ``` + +### Subcommand: status + +1. **Parse Arguments**: + - `name`: Catalog source name (required) + - `--namespace`: Namespace (default: openshift-marketplace) + +2. **Get CatalogSource Details**: + ```bash + oc get catalogsource {name} -n {namespace} -o json + ``` + +3. **Get Pod Details**: + ```bash + oc get pods -n {namespace} -l olm.catalogSource={name} -o json + ``` + +4. **Get Recent Events**: + ```bash + oc get events -n {namespace} --field-selector involvedObject.name={name} --sort-by='.lastTimestamp' + ``` + +5. **Count Available Operators**: + ```bash + oc get packagemanifests -n openshift-marketplace -o json | \ + jq -r '.items[] | select(.status.catalogSource=="{name}") | .metadata.name' | wc -l + ``` + +6. **Verify Catalog Connectivity**: + - Check if catalog is serving content by verifying PackageManifest count > 0 + - If count is 0 but pod is Running, indicates connectivity or catalog index issues + - Review catalog pod logs for gRPC errors, image pull issues, or index corruption: + ```bash + oc logs -n {namespace} {catalog-pod-name} + ``` + +7. 
**Format Comprehensive Status Report**: + ``` + ═══════════════════════════════════════════════════════════ + CATALOG SOURCE STATUS: {name} + ═══════════════════════════════════════════════════════════ + + General Information: + Name: {name} + Namespace: {namespace} + Display Name: {display-name} + Publisher: {publisher} + Source Type: {source-type} + Image: {image} + + Connection Status: + State: {state} (READY | CONNECTING | CONNECTION_FAILED) + Last Updated: {timestamp} + Last Successful: {timestamp} + + Pod Status: + Name: {pod-name} + Status: {status} (Running | CrashLoopBackOff | ImagePullBackOff) + Ready: {ready-containers}/{total-containers} + Restarts: {restart-count} + Age: {age} + + Catalog Content: + Operators Available: {count} + + [If issues detected:] + ⚠️ Issues Detected: + - Pod in CrashLoopBackOff + - Last update: 24h ago (stale) + - Connection state: CONNECTION_FAILED + + Recent Events: + {timestamp} Warning: Failed to pull image + {timestamp} Warning: Back-off restarting failed container + + Troubleshooting Steps: + 1. Check pod logs: oc logs -n {namespace} {pod-name} + 2. Check image accessibility + 3. Refresh catalog: /olm:catalog refresh {name} + 4. Verify network connectivity (for disconnected environments) + + Related Commands: + - Refresh: /olm:catalog refresh {name} + - List operators: /olm:search --catalog {name} + ``` + +## Return Value +- **list**: Table of all catalog sources with status +- **add**: Confirmation of added catalog with details +- **remove**: Confirmation of removed catalog +- **refresh**: Confirmation of refresh with updated timestamp +- **status**: Comprehensive status report for specific catalog + +## Examples + +1. **List all catalog sources**: + ``` + /olm:catalog list + ``` + +2. **Add custom catalog**: + ``` + /olm:catalog add my-catalog registry.example.com/my-catalog:v1.0 + ``` + +3. 
**Add catalog with metadata**: + ``` + /olm:catalog add my-catalog registry.example.com/catalog:latest \ + --display-name="My Custom Catalog" \ + --publisher="My Company" + ``` + +4. **Remove catalog**: + ``` + /olm:catalog remove my-catalog + ``` + +5. **Refresh catalog to get latest operators**: + ``` + /olm:catalog refresh redhat-operators + ``` + +6. **Check catalog health**: + ``` + /olm:catalog status custom-catalog + ``` + +7. **Add catalog for disconnected environment**: + ``` + /olm:catalog add disconnected-operators \ + mirror-registry.local:5000/olm/redhat-operators:v4.20 \ + --namespace=openshift-marketplace + ``` + +## Arguments + +### list +No arguments required. + +### add +- **name** (required): Name for the catalog source +- **image** (required): Container image containing the catalog +- **--namespace**: Target namespace (default: openshift-marketplace) +- **--display-name**: Human-readable display name +- **--publisher**: Publisher/organization name + +### remove +- **name** (required): Name of the catalog source to remove +- **--namespace**: Namespace (default: openshift-marketplace) + +### refresh +- **name** (required): Name of the catalog source to refresh +- **--namespace**: Namespace (default: openshift-marketplace) + +### status +- **name** (required): Name of the catalog source to check +- **--namespace**: Namespace (default: openshift-marketplace) + +## Troubleshooting + +- **Catalog pod failing**: + ```bash + # Check pod logs + oc logs -n openshift-marketplace {catalog-pod-name} + + # Check image pull issues + oc describe pod -n openshift-marketplace {catalog-pod-name} + ``` + +- **No operators showing up**: + ```bash + # Verify catalog is ready + /olm:catalog status {catalog-name} + + # Check PackageManifests + oc get packagemanifests -n openshift-marketplace + ``` + +- **Image pull errors (disconnected environment)**: + - Verify image registry is accessible + - Check pull secrets are configured + - Ensure image has been mirrored 
correctly + +- **Stale catalog data**: + ```bash + # Force refresh + /olm:catalog refresh {catalog-name} + ``` + +- **Connection failures**: + ```bash + # Check catalog source definition + oc get catalogsource {catalog-name} -n openshift-marketplace -o yaml + + # Run cluster diagnostics + /olm:diagnose --cluster + ``` + +## Related Commands + +- `/olm:search` - Search for operators in catalogs +- `/olm:install` - Install operators from catalogs +- `/olm:diagnose` - Diagnose catalog health issues + +## Additional Resources +- [Building Catalog Images with opm](https://olm.operatorframework.io/docs/tasks/creating-catalog-from-index/) +- [Operator Lifecycle Manager Documentation](https://olm.operatorframework.io/) + + diff --git a/plugins/olm/commands/install.md b/plugins/olm/commands/install.md new file mode 100644 index 000000000..ccc0bcfc2 --- /dev/null +++ b/plugins/olm/commands/install.md @@ -0,0 +1,272 @@ +--- +description: Install a day-2 operator using Operator Lifecycle Manager +argument-hint: [namespace] [channel] [source] [--approval=Automatic|Manual] +--- + +## Name +olm:install + +## Synopsis +``` +/olm:install [namespace] [channel] [source] [--approval=Automatic|Manual] +``` + +## Description +The `olm:install` command installs a day-2 operator in an OpenShift cluster using Operator Lifecycle Manager (OLM). It automates the creation of the required namespace, OperatorGroup, and Subscription resources needed to install an operator. + +This command handles the complete operator installation workflow: +- Creates or verifies the target namespace exists +- Creates an OperatorGroup if needed +- Creates a Subscription to install the operator +- Verifies the installation by checking the operator's CSV (ClusterServiceVersion) status +- Provides detailed feedback on the installation progress + +The command is designed to work with operators from the OperatorHub catalog, including Red Hat certified operators, community operators, and custom catalog sources. 
+ +## Implementation + +The command performs the following steps: + +1. **Parse Arguments**: + - `$1`: Operator name (required) - The name of the operator to install (e.g., "openshift-cert-manager-operator") + - `$2`: Namespace (optional) - Target namespace for the operator. If not provided, defaults to `{operator-name}-operator` (e.g., "cert-manager-operator") + - `$3`: Channel (optional) - Subscription channel. If not provided, discovers the default channel from the operator's PackageManifest + - `$4`: Source (optional) - CatalogSource name. Defaults to "redhat-operators" for Red Hat operators + - `$5+`: Flags (optional): + - `--approval=Automatic|Manual`: InstallPlan approval mode (default: Automatic) + - Automatic: Operator upgrades are automatically installed + - Manual: Operator upgrades require manual approval via `/olm:approve` or `oc patch` + +2. **Prerequisites Check**: + - Verify `oc` CLI is installed: `which oc` + - Verify cluster access: `oc whoami` + - Check if user has cluster-admin or sufficient privileges + - If not installed or not authenticated, provide clear instructions + +3. **Discover Operator Metadata** (if channel or source not provided): + - Search for the operator in available catalogs: + ```bash + oc get packagemanifests -n openshift-marketplace | grep {operator-name} + ``` + - Get the PackageManifest details: + ```bash + oc get packagemanifest {operator-name} -n openshift-marketplace -o json + ``` + - Extract: + - Default channel: `.status.defaultChannel` + - CatalogSource: `.status.catalogSource` + - CatalogSourceNamespace: `.status.catalogSourceNamespace` + - If operator not found, provide error with list of available operators + +4. **Create Namespace**: + - Check if namespace exists: `oc get namespace {namespace} --ignore-not-found` + - If not exists, create it: + ```bash + oc create namespace {namespace} + ``` + - If exists, inform user and continue + +5. 
**Create OperatorGroup**: + - Check if OperatorGroup exists in the namespace: + ```bash + oc get operatorgroup -n {namespace} --ignore-not-found + ``` + - If no OperatorGroup exists, create one: + ```yaml + apiVersion: operators.coreos.com/v1 + kind: OperatorGroup + metadata: + name: {namespace}-operatorgroup + namespace: {namespace} + spec: + targetNamespaces: + - {namespace} + ``` + - Save to temporary file and apply: + ```bash + oc apply -f /tmp/operatorgroup-{operator-name}.yaml + ``` + - If OperatorGroup already exists, inform user and continue + +6. **Create Subscription**: + - Parse approval mode from flags (default: Automatic) + - Create Subscription manifest (set `sourceNamespace` to the catalog's namespace from the PackageManifest's `.status.catalogSourceNamespace`; default: `openshift-marketplace`): + ```yaml + apiVersion: operators.coreos.com/v1alpha1 + kind: Subscription + metadata: + name: {operator-name} + namespace: {namespace} + spec: + channel: {channel} + name: {operator-name} + source: {source} + sourceNamespace: {catalog-source-namespace} + installPlanApproval: {Automatic|Manual} + ``` + - Save to temporary file and apply: + ```bash + oc apply -f /tmp/subscription-{operator-name}.yaml + ``` + - Display the created subscription details + - If approval mode is Manual, display informational message: + ``` + ℹ️ InstallPlan approval set to Manual + You will need to manually approve InstallPlans for this operator. + Use: /olm:approve {operator-name} {namespace} + + Reference: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-approving-operator-upgrades_olm-updating-operators + ``` + +7. 
**Verify Installation**: + - Wait for InstallPlan to be created: + ```bash + oc get installplan -n {namespace} -l operators.coreos.com/operator={operator-name} + ``` + - If approval mode is Manual, check if InstallPlan needs approval: + ```bash + oc get installplan -n {namespace} -o json | jq '.items[] | select(.spec.approved==false)' + ``` + - If Manual and not approved, display message: + ``` + ⏸️ InstallPlan created but requires manual approval + + InstallPlan: {installplan-name} + To approve: /olm:approve {operator-name} {namespace} + Or manually: oc patch installplan {installplan-name} -n {namespace} \ + --type merge --patch '{"spec":{"approved":true}}' + + Waiting for approval... + ``` + - Wait for CSV to be created and reach "Succeeded" phase: + ```bash + oc get csv -n {namespace} -w + ``` + - Use a timeout of 5 minutes for the installation to complete (10 minutes if Manual approval) + - Poll every 10 seconds to check CSV status + - Display progress updates to the user + +8. **Display Results**: + - Show the installed operator's CSV name and version + - Show the operator deployment status: + ```bash + oc get deployments -n {namespace} + ``` + - List any pods created by the operator: + ```bash + oc get pods -n {namespace} + ``` + - Display success message with next steps or usage instructions + +9. **Cleanup Temporary Files**: + - Remove temporary YAML files created during installation: + ```bash + rm -f /tmp/operatorgroup-{operator-name}.yaml /tmp/subscription-{operator-name}.yaml + ``` + +## Return Value +- **Success**: Operator installed successfully with details about the CSV, deployments, and pods +- **Error**: Installation failed with specific error message and troubleshooting suggestions +- **Format**: Structured output showing: + - Namespace created/used + - OperatorGroup status + - Subscription created + - CSV status and version + - Deployment and pod status + +## Examples + +1. 
**Install cert-manager-operator with defaults**: + ``` + /olm:install openshift-cert-manager-operator + ``` + This will: + - Create namespace `cert-manager-operator` + - Discover default channel from PackageManifest + - Use `redhat-operators` catalog source + - Install the operator + +2. **Install cert-manager-operator with custom namespace**: + ``` + /olm:install openshift-cert-manager-operator my-cert-manager + ``` + This will install the operator in the `my-cert-manager` namespace. + +3. **Install with specific channel**: + ``` + /olm:install openshift-cert-manager-operator cert-manager-operator stable-v1 + ``` + This will install from the `stable-v1` channel. + +4. **Install from community catalog**: + ``` + /olm:install prometheus community-operators stable community-operators + ``` + This will install Prometheus from the community-operators catalog. + +5. **Install Red Hat Advanced Cluster Security**: + ``` + /olm:install rhacs-operator rhacs-operator stable + ``` + +6. **Install with manual approval mode**: + ``` + /olm:install openshift-cert-manager-operator cert-manager-operator stable-v1 redhat-operators --approval=Manual + ``` + This will install the operator but require manual approval for all upgrades. + +7. 
**Install with all parameters specified**: + ``` + /olm:install external-secrets-operator eso-operator stable-v0.10 redhat-operators --approval=Automatic + ``` + +## Arguments +- **$1** (operator-name): The name of the operator to install (required) + - Example: "openshift-cert-manager-operator" + - Must match the name in the operator's PackageManifest +- **$2** (namespace): Target namespace for the operator installation (optional) + - Default: `{operator-name}` (operator name without "openshift-" prefix if present) + - Example: "cert-manager-operator" +- **$3** (channel): Subscription channel (optional) + - Default: Auto-discovered from PackageManifest's default channel + - Example: "stable-v1", "tech-preview", "stable" +- **$4** (source): CatalogSource name (optional) + - Default: "redhat-operators" + - Other options: "certified-operators", "community-operators", "redhat-marketplace" +- **$5+** (flags): Optional flags + - `--approval=Automatic|Manual`: InstallPlan approval mode + - **Automatic** (default): Operator upgrades are automatically installed without user intervention + - **Manual**: Operator upgrades require explicit approval. Useful for: + - Production environments requiring change control + - Testing upgrades before applying + - Preventing unexpected operator updates + - Reference: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-approving-operator-upgrades_olm-updating-operators + +## Notes + +- **Automatic Channel Discovery**: If no channel is specified, the command automatically discovers and uses the operator's default channel from its PackageManifest +- **Namespace Convention**: By default, operators are installed in a namespace named after the operator with any "openshift-" prefix removed (e.g., `openshift-cert-manager-operator` is installed into `cert-manager-operator`) +- **OperatorGroup Scope**: The created OperatorGroup targets only the installation namespace for better isolation +- **InstallPlan Approval**: Set to "Automatic" by default for seamless installation. 
Can be changed to "Manual" using `--approval=Manual` flag +- **Manual Approval Mode**: When using `--approval=Manual`: + - Initial installation may require manual approval of the InstallPlan + - All future upgrades will require explicit approval via `/olm:approve` command + - Provides better control over operator updates in production environments +- **Verification Timeout**: The command waits up to 5 minutes for the operator to install successfully (10 minutes for manual approval mode) +- **Cleanup**: Temporary YAML files are automatically removed after installation + +## Troubleshooting + +- **Operator not found**: Run `oc get packagemanifests -n openshift-marketplace` to see available operators +- **Permission denied**: Ensure you have cluster-admin privileges or the necessary RBAC permissions +- **Installation timeout**: Check the InstallPlan and CSV status manually: + ```bash + oc get installplan -n {namespace} + oc get csv -n {namespace} + oc describe csv -n {namespace} + ``` +- **Operator pod not starting**: Check pod logs: + ```bash + oc logs -n {namespace} deployment/{operator-deployment} + ``` + diff --git a/plugins/olm/commands/opm.md b/plugins/olm/commands/opm.md new file mode 100644 index 000000000..bd30fe0de --- /dev/null +++ b/plugins/olm/commands/opm.md @@ -0,0 +1,359 @@ +--- +description: Execute opm (Operator Package Manager) commands for building and managing operator catalogs +argument-hint: [arguments...] 
+--- + +## Name +olm:opm + +## Synopsis +```bash +/olm:opm build-index-image <catalog-path> <index-image-tag> [--cacheless] [--arch=<arch>] [--base-image=<image>] [--builder-image=<image>] +/olm:opm build-semver-index-image <semver-template-file> <index-image-tag> [--cacheless] [--arch=<arch>] [--base-image=<image>] [--builder-image=<image>] +/olm:opm generate-semver-template <bundle-list> [--output=<file>] [--major=true|false] [--minor=true|false] +/olm:opm list packages <index-ref> +/olm:opm list channels <index-ref> [package-name] +/olm:opm list bundles <index-ref> [package-name] +``` + +## Description +The `olm:opm` command provides a unified interface to `opm` (Operator Package Manager) operations for building and managing operator catalog indexes. It supports building catalog indexes, generating semver templates, and querying catalog contents. + +## Arguments +- `$1`: **action** - The action to perform: + - `build-index-image`: Build an index from an existing catalog directory + - `build-semver-index-image`: Build an index from a semver template + - `generate-semver-template`: Generate a semver template file + - `list`: List catalog contents (requires second argument: `packages`, `channels`, or `bundles`) +- `$2+`: Additional arguments specific to each action (see Actions section below) + +## Actions + +### build-index-image +Build an operator catalog index image from an existing catalog directory. 
+ +**Synopsis:** +```bash +/olm:opm build-index-image <catalog-path> <index-image-tag> [--cacheless] [--arch=<arch>] [--base-image=<image>] [--builder-image=<image>] +``` + +**Arguments:** +- `$2`: **catalog-path** - Path to the catalog directory containing the index configuration +- `$3`: **index-image-tag** - Full image tag for the resulting index image (e.g., `quay.io/myorg/mycatalog:v1.0.0`) +- `--cacheless`: Optional flag to build a cacheless image (uses `scratch` as base image; `--base-image` and `--builder-image` are ignored when this is set) +- `--arch=<arch>`: Optional architecture specification (default: `multi` for multi-arch build; can specify single arch like `amd64`, `arm64`, `ppc64le`, `s390x`) +- `--base-image=<image>`: Optional base image for the index (default: `quay.io/operator-framework/opm:latest`; ignored if `--cacheless` is set) +- `--builder-image=<image>`: Optional builder image (default: `quay.io/operator-framework/opm:latest`; ignored if `--cacheless` is set) + +**Examples:** +```bash +/olm:opm build-index-image catalog quay.io/myorg/mycatalog:v1.0.0 +/olm:opm build-index-image catalog quay.io/myorg/mycatalog:v1.0.0 --cacheless +/olm:opm build-index-image catalog quay.io/myorg/mycatalog:v1.0.0 --arch=amd64 +``` + +### build-semver-index-image +Build a multi-architecture operator catalog index image using the semver template format. 
+ +**Synopsis:** +```bash +/olm:opm build-semver-index-image <semver-template-file> <index-image-tag> [--cacheless] [--arch=<arch>] [--base-image=<image>] [--builder-image=<image>] +``` + +**Arguments:** +- `$2`: **semver-template-file** - Path to the semver template configuration file (e.g., `catalog-config.yaml`) +- `$3`: **index-image-tag** - Full image tag for the resulting index image (e.g., `quay.io/myorg/mycatalog:v1.0.0`) +- `--cacheless`: Optional flag to build a cacheless image (uses `scratch` as base image; `--base-image` and `--builder-image` are ignored when this is set) +- `--arch=<arch>`: Optional architecture specification (default: `multi` for multi-arch build; can specify single arch like `amd64`, `arm64`, `ppc64le`, `s390x`) +- `--base-image=<image>`: Optional base image for the index (default: `quay.io/operator-framework/opm:latest`; ignored if `--cacheless` is set) +- `--builder-image=<image>`: Optional builder image (default: `quay.io/operator-framework/opm:latest`; ignored if `--cacheless` is set) + +**Examples:** +```bash +/olm:opm build-semver-index-image catalog-config.yaml quay.io/myorg/mycatalog:v1.0.0 +/olm:opm build-semver-index-image catalog-config.yaml quay.io/myorg/mycatalog:v1.0.0 --cacheless +/olm:opm build-semver-index-image catalog-config.yaml quay.io/myorg/mycatalog:v1.0.0 --arch=amd64 +/olm:opm build-semver-index-image catalog-config.yaml quay.io/myorg/mycatalog:v1.0.0 --arch=multi +``` + +### generate-semver-template +Generate a semver template configuration file for building operator catalogs. 
+ +**Synopsis:** +```bash +/olm:opm generate-semver-template <bundle-list> [--output=<file>] [--major=true|false] [--minor=true|false] +``` + +**Arguments:** +- `$2`: **bundle-list** - Comma-separated list of bundle image references (e.g., `quay.io/org/bundle:v1.0.0,quay.io/org/bundle:v1.0.1`) +- `--output=<file>`: Optional output file path (default: `catalog-semver-config.yaml` in current directory) +- `--major=true|false`: Optional flag to generate major version channels (default: `true`) +- `--minor=true|false`: Optional flag to generate minor version channels (default: `false`) + +**Examples:** +```bash +/olm:opm generate-semver-template quay.io/org/bundle:v1.0.0,quay.io/org/bundle:v1.0.1 +/olm:opm generate-semver-template quay.io/org/bundle:v1.0.0,quay.io/org/bundle:v1.0.1 --output=my-catalog.yaml +/olm:opm generate-semver-template quay.io/org/bundle:v1.0.0,quay.io/org/bundle:v1.1.0 --minor=true +``` + +### list packages +List all operator packages available in a catalog index. + +**Synopsis:** +```bash +/olm:opm list packages <index-ref> +``` + +**Arguments:** +- `$1`: **list** - Must be "list" +- `$2`: **packages** - Must be "packages" +- `$3`: **index-ref** - Catalog index reference, either: + - Image tag: `quay.io/myorg/mycatalog:v1.0.0` + - Directory path: `./catalog` or `/path/to/catalog` + +**Examples:** +```bash +/olm:opm list packages quay.io/olmqe/nginx8518-index-test:v1 +/olm:opm list packages ./catalog +``` + +### list channels +List channels for operator packages in a catalog index. 
+ +**Synopsis:** +```bash +/olm:opm list channels <index-ref> [package-name] +``` + +**Arguments:** +- `$1`: **list** - Must be "list" +- `$2`: **channels** - Must be "channels" +- `$3`: **index-ref** - Catalog index reference (image tag or directory path) +- `$4`: **package-name** (Optional) - Name of a specific package to list channels for + +**Examples:** +```bash +/olm:opm list channels quay.io/olmqe/nginx8518-index-test:v1 +/olm:opm list channels quay.io/olmqe/nginx8518-index-test:v1 nginx85187 +/olm:opm list channels ./catalog +``` + +### list bundles +List bundles for operator packages in a catalog index. + +**Synopsis:** +```bash +/olm:opm list bundles <index-ref> [package-name] +``` + +**Arguments:** +- `$1`: **list** - Must be "list" +- `$2`: **bundles** - Must be "bundles" +- `$3`: **index-ref** - Catalog index reference (image tag or directory path) +- `$4`: **package-name** (Optional) - Name of a specific package to list bundles for + +**Examples:** +```bash +/olm:opm list bundles quay.io/olmqe/nginx8518-index-test:v1 +/olm:opm list bundles quay.io/olmqe/nginx8518-index-test:v1 nginx85187 +/olm:opm list bundles ./catalog +``` + +## Implementation + +### Step 1: Parse Action +- Extract the action from `$1` +- Validate the action is one of: `build-index-image`, `build-semver-index-image`, `generate-semver-template`, `list` +- If invalid action, display error with available actions + +### Step 2: Check Prerequisites +Verify required tools are installed: +- Check for `opm`: `which opm` + - If not found, provide installation instructions (release binaries: https://github.com/operator-framework/operator-registry/releases) +- For build actions, also check for `podman`: `which podman` + - If not found, provide installation instructions based on user's platform + +### Step 3: Route to Action Handler +Based on the action, call the appropriate implementation: + +#### For `build-index-image`: +1. 
**Parse Arguments and Set Defaults** + - Extract catalog path from `$2` + - Extract index image tag from `$3` + - Parse optional flags: `--cacheless`, `--arch`, `--base-image`, `--builder-image` + - Set defaults: arch=`multi`, base-image=`quay.io/operator-framework/opm:latest`, builder-image=`quay.io/operator-framework/opm:latest` + +2. **Verify Catalog Directory** + - Check catalog directory exists: `test -d <catalog-path>` + +3. **Validate Catalog** + ```bash + opm validate <catalog-path> + ``` + +4. **Generate Dockerfile** + - If cacheless: `opm generate dockerfile <catalog-path> --base-image=scratch` + - If normal: `opm generate dockerfile <catalog-path> -b <base-image> -i <builder-image>` + +5. **Determine Build Platform** + - If arch=`multi`: `linux/amd64,linux/arm64,linux/ppc64le,linux/s390x` + - Otherwise: `linux/<arch>` + +6. **Create Podman Manifest** + ```bash + podman manifest rm <index-image-tag> 2>/dev/null || true + podman manifest create <index-image-tag> + ``` + +7. **Build Image** + ```bash + podman build --platform <platform> --manifest <index-image-tag> . -f catalog.Dockerfile + ``` + +8. **Push Manifest** + ```bash + podman manifest push <index-image-tag> + ``` + +9. **List Bundles in Index** + ```bash + opm alpha list bundles <index-image-tag> + ``` + +10. **Display Success Message** + +#### For `build-semver-index-image`: +1. **Parse Arguments and Set Defaults** + - Extract semver template file from `$2` + - Extract index image tag from `$3` + - Parse optional flags: `--cacheless`, `--arch`, `--base-image`, `--builder-image` + - Set defaults: arch=`multi`, base-image=`quay.io/operator-framework/opm:latest`, builder-image=`quay.io/operator-framework/opm:latest` + +2. **Verify Template File** + - Check file exists: `test -f <semver-template-file>` + +3. **Create Catalog and Render Template** + ```bash + mkdir -p catalog + opm alpha render-template semver <semver-template-file> -o yaml > catalog/index.yaml + ``` + +4. **Validate Catalog** + ```bash + opm validate catalog + ``` + +5. **Generate Dockerfile** + - If cacheless: `opm generate dockerfile catalog --base-image=scratch` + - If normal: `opm generate dockerfile catalog -b <base-image> -i <builder-image>` + +6. 
**Determine Build Platform** + - If arch=`multi`: `linux/amd64,linux/arm64,linux/ppc64le,linux/s390x` + - Otherwise: `linux/<arch>` + +7. **Create Podman Manifest** + ```bash + podman manifest rm <index-image-tag> 2>/dev/null || true + podman manifest create <index-image-tag> + ``` + +8. **Build Image** + ```bash + podman build --platform <platform> --manifest <index-image-tag> . -f catalog.Dockerfile + ``` + +9. **Push Manifest** + ```bash + podman manifest push <index-image-tag> + ``` + +10. **List Bundles in Index** + ```bash + opm alpha list bundles <index-image-tag> + ``` + +11. **Display Success Message** + +#### For `generate-semver-template`: +1. **Parse Arguments and Set Defaults** + - Extract bundle list from `$2` + - Parse optional flags: `--output`, `--major`, `--minor` + - Set defaults: output=`catalog-semver-config.yaml`, major=`true`, minor=`false` + +2. **Validate Bundle List** + - Split by commas + - Validate each bundle is a valid image reference + +3. **Generate YAML Content** + ```yaml + Schema: olm.semver + GenerateMajorChannels: <true|false> + GenerateMinorChannels: <true|false> + Candidate: + Bundles: + - Image: <bundle-image-1> + - Image: <bundle-image-2> + ``` + +4. **Write Template File** + - Check if file exists and confirm overwrite if needed + - Write YAML content + +5. **Validate Generated File** + - Read back and verify YAML is well-formed + +6. **Display Success Message** + - Show file path, bundles included, settings + - Suggest next step: `/olm:opm build-semver-index-image <output-file> <index-image-tag>` + +#### For `list`: +1. **Parse List Type** + - Extract list type from `$2` (must be `packages`, `channels`, or `bundles`) + - If invalid, display error with available types + +2. **Parse Index Reference and Optional Package** + - Extract index-ref from `$3` + - Extract optional package-name from `$4` (for channels and bundles) + +3. **Determine Reference Type** + - Check if directory: `test -d <index-ref>` + +4. **Execute List Command** + - For packages: `opm alpha list packages <index-ref>` + - For channels: `opm alpha list channels <index-ref> [package-name]` + - For bundles: `opm alpha list bundles <index-ref> [package-name]` + +5. 
**Display Results** + - Show the output with appropriate formatting + - Display count of items found + +## Return Value + +**Format**: Varies by action + +- **build-index-image / build-semver-index-image**: Success message with image tag, architectures, and bundle list +- **generate-semver-template**: Success message with file path and configuration details +- **list**: Table or list of catalog contents + +On failure, displays: +- Clear error message indicating which step/action failed +- Relevant tool output for debugging +- Suggestions for resolution + +## Notes + +- Ensure you are authenticated to container registries before building/pushing images (use `podman login`) +- For build operations, the `catalog.Dockerfile` is created in the current working directory +- Multi-architecture builds can be time-consuming +- Cacheless builds result in smaller images and use `scratch` as the base image +- When using `--cacheless`, the `--base-image` and `--builder-image` options are ignored (scratch is always used as base) +- Index references can be either image tags or local directory paths +- Bundle images must be accessible from where you build the catalog +- Image tags should include the full registry hostname (e.g., `quay.io/org/image:tag` not `quay/org/image:tag`) + +## Related Commands + +- `/olm:install` - Install an operator using OLM +- `/olm:catalog` - Manage catalog sources +- `/olm:debug` - Debug OLM issues diff --git a/plugins/olm/commands/uninstall.md b/plugins/olm/commands/uninstall.md new file mode 100644 index 000000000..36c3ec14c --- /dev/null +++ b/plugins/olm/commands/uninstall.md @@ -0,0 +1,392 @@ +--- +description: Uninstall a day-2 operator and optionally remove its resources +argument-hint: [namespace] [--remove-crds] [--remove-namespace] +--- + +## Name +olm:uninstall + +## Synopsis +``` +/olm:uninstall [namespace] [--remove-crds] [--remove-namespace] +``` + +## Description +The `olm:uninstall` command uninstalls a day-2 operator from an OpenShift 
cluster by removing its Subscription, ClusterServiceVersion (CSV), and optionally its Custom Resource Definitions (CRDs) and namespace. + +This command provides a comprehensive uninstallation workflow: +- Removes the operator's Subscription +- Deletes the ClusterServiceVersion (CSV) +- Optionally removes operator-managed deployments +- Optionally deletes Custom Resource Definitions (CRDs) +- Optionally removes the operator's namespace +- Provides detailed feedback on each step + +The command is designed to safely clean up operators installed via OLM, with optional flags for thorough cleanup of all operator-related resources. + +## Implementation + +The command performs the following steps: + +1. **Parse Arguments**: + - `$1`: Operator name (required) - The name of the operator to uninstall + - `$2`: Namespace (optional) - The namespace where the operator is installed. If not provided, defaults to the operator name with any "openshift-" prefix removed (e.g., "cert-manager-operator" for "openshift-cert-manager-operator") + - `$3+`: Flags (optional): + - `--remove-crds`: Remove Custom Resource Definitions after uninstalling + - `--remove-namespace`: Remove the operator's namespace after cleanup + - `--force`: Skip confirmation prompts + +2. **Prerequisites Check**: + - Verify `oc` CLI is installed: `which oc` + - Verify cluster access: `oc whoami` + - Check if user has cluster-admin or sufficient privileges + +3. **Verify Operator Installation**: + - Check if namespace exists: + ```bash + oc get namespace {namespace} --ignore-not-found + ``` + - Check if subscription exists: + ```bash + oc get subscription {operator-name} -n {namespace} --ignore-not-found + ``` + - If not found, display error: "Operator {operator-name} is not installed in namespace {namespace}" + - List what will be uninstalled + +4. 
**Display Uninstallation Plan**: + - Show operator details: + ```bash + oc get subscription {operator-name} -n {namespace} -o yaml + oc get csv -n {namespace} + ``` + - Display what will be removed: + - Subscription name and namespace + - CSV name and version + - Deployments (if any) + - CRDs (if `--remove-crds` flag is set) + - Namespace (if `--remove-namespace` flag is set) + +5. **Request User Confirmation** (unless `--force` flag is set): + - Display warning: + ``` + WARNING: You are about to uninstall {operator-name} from namespace {namespace}. + This will remove: + - Subscription: {subscription-name} + - ClusterServiceVersion: {csv-name} + - Operator deployments + [- Custom Resource Definitions (if --remove-crds is set)] + [- Namespace {namespace} (if --remove-namespace is set)] + + Are you sure you want to continue? (yes/no) + ``` + - Wait for user confirmation + - If user says no, abort operation + +6. **Delete Subscription**: + - Remove the operator's subscription: + ```bash + oc delete subscription {operator-name} -n {namespace} + ``` + - Verify deletion: + ```bash + oc get subscription {operator-name} -n {namespace} --ignore-not-found + ``` + - Display result + +7. **Delete ClusterServiceVersion (CSV)**: + - Get the CSV name (`oc` JSONPath filters have no `contains` operator, so select the CSV by the component label OLM applies to it; this still works after the Subscription has been deleted): + ```bash + oc get csv -n {namespace} -l operators.coreos.com/{operator-name}.{namespace} -o jsonpath='{.items[*].metadata.name}' + ``` + - Delete the CSV: + ```bash + oc delete csv {csv-name} -n {namespace} + ``` + - This will automatically remove operator deployments + - Verify CSV is deleted: + ```bash + oc get csv -n {namespace} --ignore-not-found + ``` + +8. 
**Remove Operator Deployments** (if still present): + - List deployments created by the operator: + ```bash + oc get deployments -n {namespace} + ``` + - For operators like cert-manager with labeled resources: + ```bash + oc delete deployment -n {namespace} -l app.kubernetes.io/instance={operator-base-name} + ``` + - Verify deployments are deleted: + ```bash + oc get deployments -n {namespace} + ``` + +8.5. **Check for Orphaned Custom Resources** (before removing CRDs): + - Get list of CRDs managed by the operator from CSV: + ```bash + oc get csv -n {namespace} -o jsonpath='{.items[0].spec.customresourcedefinitions.owned[*].name}' + ``` + - For each CRD, search for CR instances across all namespaces: + ```bash + oc get --all-namespaces --ignore-not-found + ``` + - If CRs exist, list them with details: + ``` + WARNING: Found custom resources that may prevent clean uninstallation: + - namespace-1/ (kind: ) + - namespace-2/ (kind: ) + + These resources should be deleted before uninstalling the operator. + Do you want to delete these custom resources? (yes/no) + ``` + - If user confirms, delete each CR: + ```bash + oc delete -n + ``` + - This prevents namespace from getting stuck in Terminating state + - Reference: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-reinstalling-operators-after-failed-uninstallation_olm-troubleshooting-operator-issues + +9. **Remove Custom Resource Definitions** (if `--remove-crds` flag is set): + - **WARNING**: Display critical warning to user: + ``` + WARNING: Removing CRDs will delete ALL custom resources of these types across the entire cluster! + This action is irreversible and will affect all namespaces. + + Are you absolutely sure you want to remove CRDs? 
(yes/no) + ``` + - If user confirms, proceed with CRD removal + - Get list of CRDs owned by the operator: + ```bash + oc get csv {csv-name} -n {namespace} -o jsonpath='{.spec.customresourcedefinitions.owned[*].name}' + ``` + - For each CRD, check if custom resources exist: + ```bash + oc get {crd-name} --all-namespaces --ignore-not-found + ``` + - Display warning if custom resources exist + - Delete CRDs: + ```bash + oc delete crd {crd-name} + ``` + +10. **Remove Namespace** (if `--remove-namespace` flag is set): + - **WARNING**: Display warning: + ``` + WARNING: Removing namespace {namespace} will delete all resources in this namespace! + + Are you sure you want to remove namespace {namespace}? (yes/no) + ``` + - If user confirms: + ```bash + oc delete namespace {namespace} + ``` + - Monitor namespace deletion with timeout: + ```bash + oc wait --for=delete namespace/{namespace} --timeout=120s + ``` + - If namespace gets stuck in "Terminating" state after 120 seconds: + - Check for resources preventing deletion: + ```bash + oc api-resources --verbs=list --namespaced -o name | \ + xargs -n 1 oc get --show-kind --ignore-not-found -n {namespace} + ``` + - Check for finalizers on the namespace: + ```bash + oc get namespace {namespace} -o jsonpath='{.metadata.finalizers}' + ``` + - Display helpful error message: + ``` + ERROR: Namespace {namespace} is stuck in Terminating state. + + Possible causes: + - Resources with finalizers preventing deletion + - API services that are unavailable + - Custom resources that cannot be deleted + + To diagnose and fix, run: /olm:diagnose {operator-name} {namespace} + + Manual troubleshooting: + 1. Check remaining resources: + oc api-resources --verbs=list --namespaced -o name | \ + xargs -n 1 oc get --show-kind --ignore-not-found -n {namespace} + + 2. Check namespace finalizers: + oc get namespace {namespace} -o yaml | grep -A5 finalizers + + WARNING: Do NOT force-delete the namespace as it can lead to unstable cluster behavior. 
+ See: https://access.redhat.com/solutions/4165791 + ``` + - Exit with error code + - Note: OperatorGroup will be automatically deleted with the namespace + +11. **Post-Uninstall Verification**: + - Verify all resources are cleaned up: + ```bash + oc get subscription,csv,installplan -n {namespace} --ignore-not-found + ``` + - Check if any CRDs remain (if they were supposed to be deleted): + ```bash + oc get crd | grep + ``` + - If uninstalling without `--remove-namespace`, check namespace is clean: + ```bash + oc get all -n {namespace} + ``` + - Display any remaining resources with suggestions for cleanup + +12. **Display Uninstallation Summary**: + - Show what was successfully removed: + ``` + ✓ Uninstallation Summary: + ✓ Subscription '{operator-name}' deleted + ✓ CSV '{csv-name}' deleted + ✓ Operator deployments removed + [✓ X custom resources deleted] + [✓ Y CRDs removed] + [✓ Namespace '{namespace}' deleted] + ``` + - If CRDs or namespace were NOT removed, provide instructions: + ``` + Note: The following resources were NOT removed: + - Custom Resource Definitions (use --remove-crds to remove) + - Namespace {namespace} (use --remove-namespace to remove) + + To completely remove all operator resources, run: + /olm:uninstall {operator-name} {namespace} --remove-crds --remove-namespace + ``` + - **Important warning about reinstallation**: + ``` + IMPORTANT: Before reinstalling this operator, verify all resources are cleaned: + + oc get subscription,csv,installplan -n {namespace} + oc get crd | grep + + Failure to completely uninstall may cause reinstallation issues. 
+ See: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-reinstalling-operators-after-failed-uninstallation_olm-troubleshooting-operator-issues + ``` + +## Return Value +- **Success**: Operator uninstalled successfully with summary of removed resources +- **Partial Success**: Some resources removed with warnings about remaining resources +- **Error**: Uninstallation failed with specific error message +- **Format**: Structured output showing: + - Subscription deletion status + - CSV deletion status + - Deployment removal status + - CRD removal status (if applicable) + - Namespace deletion status (if applicable) + +## Examples + +1. **Uninstall cert-manager-operator (basic)**: + ``` + /olm:uninstall openshift-cert-manager-operator + ``` + +2. **Uninstall with custom namespace**: + ``` + /olm:uninstall openshift-cert-manager-operator my-cert-manager + ``` + +3. **Complete cleanup including namespace**: + ``` + /olm:uninstall openshift-cert-manager-operator cert-manager-operator --remove-crds --remove-namespace + ``` + This performs a complete cleanup of all operator-related resources. + +4. **Force uninstall without prompts**: + ``` + /olm:uninstall openshift-cert-manager-operator cert-manager-operator --force + ``` + Skips all confirmation prompts (use with caution!). 
+ +## Arguments +- **$1** (operator-name): The name of the operator to uninstall (required) + - Example: "openshift-cert-manager-operator" + - Must match the Subscription name +- **$2** (namespace): The namespace where operator is installed (optional) + - Default: `{operator-name}` (operator name without "openshift-" prefix) + - Example: "cert-manager-operator" +- **$3+** (flags): Optional flags (can combine multiple): + - `--remove-crds`: Remove Custom Resource Definitions (WARNING: affects entire cluster) + - `--remove-namespace`: Remove the operator's namespace and all its resources + - `--force`: Skip all confirmation prompts (use with caution) + +## Safety Features + +1. **Multiple Confirmations**: Separate confirmations for CRD and namespace removal +2. **Detailed Warnings**: Clear warnings about the scope of deletions +3. **Verification Steps**: Checks that resources exist before attempting deletion +4. **Summary Report**: Detailed summary of what was and wasn't removed +5. **Graceful Failures**: Continues with remaining steps if individual deletions fail + +## Troubleshooting + +- **Subscription not found**: Verify the operator name and namespace: + ```bash + oc get subscriptions --all-namespaces | grep {operator-name} + ``` +- **CSV won't delete**: Check for finalizers: + ```bash + oc get csv {csv-name} -n {namespace} -o yaml | grep finalizers + ``` + If finalizers are present, they may be waiting for resources to be cleaned up. Check operator logs and events. + +- **Namespace stuck in Terminating**: This is a common issue after operator uninstallation. + ```bash + # Find remaining resources + oc api-resources --verbs=list --namespaced -o name | \ + xargs -n 1 oc get --show-kind --ignore-not-found -n {namespace} + + # Check namespace finalizers + oc get namespace {namespace} -o yaml | grep -A5 finalizers + ``` + **IMPORTANT**: Do not force-delete the namespace. This can cause cluster instability. 
+ Instead, use `/olm:diagnose {operator-name} {namespace}` to diagnose and fix the issue. + +- **CRDs won't delete**: Check for remaining custom resources: + ```bash + oc get {crd-name} --all-namespaces + ``` + CRDs cannot be deleted while CR instances exist. Delete all CRs first. + +- **Custom resources won't delete**: Some CRs may have finalizers preventing deletion: + ```bash + oc get -n -o yaml | grep finalizers + ``` + The operator controller (if still running) should remove finalizers. If operator is already deleted, you may need to manually patch the CR to remove finalizers (use with extreme caution). + +- **Permission denied**: Ensure you have cluster-admin privileges for CRD deletion: + ```bash + oc auth can-i delete crd + ``` + +- **Reinstallation fails after uninstall**: This usually means cleanup was incomplete. + Run these checks before reinstalling: + ```bash + # Check for remaining subscriptions/CSVs + oc get subscription,csv -n {namespace} + + # Check for remaining CRDs + oc get crd | grep + + # Check if namespace is clean or stuck + oc get namespace {namespace} + ``` + See: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-reinstalling-operators-after-failed-uninstallation_olm-troubleshooting-operator-issues + +## Related Commands + +- `/olm:install` - Install a day-2 operator +- `/olm:list` - List installed operators +- `/olm:status` - Check operator status before uninstalling +- `/olm:diagnose` - Diagnose and fix uninstallation issues +- `/olm:upgrade` - Upgrade an operator + +## Additional Resources + +- [Red Hat OpenShift: Deleting Operators from a cluster](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-deleting-operators-from-a-cluster) +- [Red Hat OpenShift: Reinstalling Operators after failed 
uninstallation](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-reinstalling-operators-after-failed-uninstallation_olm-troubleshooting-operator-issues) +- [Operator Lifecycle Manager Documentation](https://olm.operatorframework.io/) + diff --git a/plugins/olm/commands/upgrade.md b/plugins/olm/commands/upgrade.md new file mode 100644 index 000000000..75434f615 --- /dev/null +++ b/plugins/olm/commands/upgrade.md @@ -0,0 +1,349 @@ +--- +description: Update an operator to the latest version or switch channels +argument-hint: <operator-name> [namespace] [--channel=<channel-name>] [--approve] +--- + +## Name +olm:upgrade + +## Synopsis +``` +/olm:upgrade <operator-name> [namespace] [--channel=<channel-name>] [--approve] +``` + +## Description +The `olm:upgrade` command updates an installed operator to the latest version in its current channel or switches to a different channel. It can also approve pending InstallPlans for operators with manual approval mode. + +This command helps you: +- Update operators to the latest version in their channel +- Switch operators to different channels (e.g., stable to tech-preview) +- Approve pending upgrade InstallPlans for manual approval mode +- Monitor upgrade progress +- Rollback on failure (if possible via OLM) + +## Implementation + +The command performs the following steps: + +1. **Parse Arguments**: + - `$1`: Operator name (required) - Name of the operator to upgrade + - `$2`: Namespace (optional) - Namespace where operator is installed + - If not provided, searches for the operator across all namespaces + - `$3+`: Flags (optional): + - `--channel=<channel-name>`: Switch to a different channel + - `--approve`: Automatically approve pending InstallPlan (for manual approval mode) + +2. **Prerequisites Check**: + - Verify `oc` CLI is installed: `which oc` + - Verify cluster access: `oc whoami` + - Check if user has sufficient privileges + +3. 
**Locate Operator**: + - If namespace provided, verify operator exists: + ```bash + oc get subscription {operator-name} -n {namespace} --ignore-not-found + ``` + - If no namespace provided, search across all namespaces: + ```bash + oc get subscription --all-namespaces -o json | jq -r '.items[] | select(.spec.name=="{operator-name}") | .metadata.namespace' + ``` + - If not found, display error with suggestions + - If multiple instances found, prompt user to specify namespace + +4. **Get Current State**: + - Get current Subscription: + ```bash + oc get subscription {operator-name} -n {namespace} -o json + ``` + - Extract: + - Current channel: `.spec.channel` + - Install plan approval: `.spec.installPlanApproval` + - Installed CSV: `.status.installedCSV` + - Current CSV: `.status.currentCSV` + - Get current CSV version: + ```bash + oc get csv {installed-csv} -n {namespace} -o jsonpath='{.spec.version}' + ``` + +5. **Check for Available Updates**: + - Get PackageManifest: + ```bash + oc get packagemanifest {operator-name} -n openshift-marketplace -o json + ``` + - Extract available channels and their latest versions + - If `--channel` flag is specified, verify channel exists + - If no channel flag, check for updates in current channel + - Compare current version with latest available version + - Reference: https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-updating-operators + +6. **Display Upgrade Plan**: + ``` + Operator Upgrade Plan: + + Operator: {display-name} + Namespace: {namespace} + Current Version: {current-version} + Current Channel: {current-channel} + + [If switching channels:] + Target Channel: {new-channel} + Target Version: {new-version} + + [If upgrading in same channel:] + Latest Version: {latest-version} (in channel: {current-channel}) + + Approval Mode: {Automatic|Manual} + ``` + +7. 
**Check for Pending InstallPlans** (for manual approval mode): + - Get pending InstallPlans: + ```bash + oc get installplan -n {namespace} -o json | jq '.items[] | select(.spec.approved==false)' + ``` + - If pending InstallPlan exists and `--approve` flag is set: + - Display InstallPlan details + - Approve the InstallPlan (skip to step 9) + - If pending InstallPlan exists and no `--approve` flag: + ``` + ⏸️ Pending InstallPlan found (requires manual approval) + + InstallPlan: {installplan-name} + Target Version: {target-version} + + To approve: /olm:upgrade {operator-name} {namespace} --approve + Or use: /olm:approve {operator-name} {namespace} + ``` + - Exit, waiting for user to approve + +8. **Perform Channel Switch** (if `--channel` flag provided): + - Confirm with user (unless `--force` flag): + ``` + WARNING: Switching channels may upgrade or downgrade the operator. + + Current: {current-channel} ({current-version}) + Target: {new-channel} ({target-version}) + + Continue? (yes/no) + ``` + - Update Subscription to new channel: + ```bash + oc patch subscription {operator-name} -n {namespace} \ + --type merge --patch '{"spec":{"channel":"{new-channel}"}}' + ``` + - Display confirmation: + ``` + ✓ Subscription updated to channel: {new-channel} + ``` + +9. **Approve Pending InstallPlan** (if `--approve` flag or automatic approval): + - If approval mode is Manual and `--approve` flag is set: + ```bash + oc patch installplan {installplan-name} -n {namespace} \ + --type merge --patch '{"spec":{"approved":true}}' + ``` + - Display approval confirmation: + ``` + ✓ InstallPlan approved: {installplan-name} + ``` + +10. **Monitor Upgrade Progress**: + - Wait for new InstallPlan to be created (if switching channels): + ```bash + oc get installplan -n {namespace} -w --request-timeout=60s + ``` + - Wait for new CSV to reach "Succeeded" phase: + ```bash + oc get csv -n {namespace} -w --request-timeout=300s + ``` + - Display progress updates: + ``` + 🔄 Upgrade in progress... 
+ ⏳ Waiting for InstallPlan to complete... + ⏳ New CSV installing: {new-csv-name} + ⏳ Old CSV replacing: {old-csv-name} + ``` + - Poll every 10 seconds to check status + - Timeout: 10 minutes for upgrade to complete + +11. **Verify Upgrade Success**: + - Check new CSV status: + ```bash + oc get csv -n {namespace} -o json + ``` + - Verify new CSV phase is "Succeeded" + - Get new version: + ```bash + oc get csv {new-csv-name} -n {namespace} -o jsonpath='{.spec.version}' + ``` + - Check deployments are healthy: + ```bash + oc get deployments -n {namespace} + ``` + - Check pods are running: + ```bash + oc get pods -n {namespace} + ``` + +12. **Display Upgrade Summary**: + ``` + ✓ Operator Upgrade Complete! + + Operator: {display-name} + Namespace: {namespace} + Previous Version: {old-version} + Current Version: {new-version} + Channel: {channel} + + Deployment Status: + - {deployment-1}: 1/1 replicas ready + - {deployment-2}: 1/1 replicas ready + + To check status: /olm:status {operator-name} {namespace} + ``` + +13. **Handle Upgrade Failures**: + - If upgrade fails or times out: + ``` + ❌ Operator upgrade failed + + Current State: + - CSV: {csv-name} (Phase: {phase}) + - Message: {error-message} + + Troubleshooting steps: + 1. Check CSV status: oc describe csv {csv-name} -n {namespace} + 2. Check events: oc get events -n {namespace} --sort-by='.lastTimestamp' + 3. Check InstallPlan: oc get installplan -n {namespace} + 4. 
Run diagnostics: /olm:diagnose {operator-name} {namespace} + + To rollback (if OLM supports): + oc patch subscription {operator-name} -n {namespace} \ + --type merge --patch '{"spec":{"channel":"{old-channel}"}}' + ``` + +## Return Value +- **Success**: Operator upgraded successfully with new version details +- **Pending Approval**: Upgrade waiting for manual approval with instructions +- **No Update Available**: Operator is already at the latest version +- **Error**: Upgrade failed with specific error message and troubleshooting guidance +- **Format**: Structured output showing: + - Previous and current versions + - Channel information + - Deployment and pod status + - Next steps or related commands + +## Examples + +1. **Check for and install updates in current channel**: + ``` + /olm:upgrade openshift-cert-manager-operator + ``` + +2. **Upgrade with specific namespace**: + ``` + /olm:upgrade external-secrets-operator eso-operator + ``` + +3. **Switch to a different channel**: + ``` + /olm:upgrade openshift-cert-manager-operator cert-manager-operator --channel=tech-preview-v1.14 + ``` + This switches from stable-v1 to tech-preview-v1.14 channel. + +4. **Approve pending upgrade (manual approval mode)**: + ``` + /olm:upgrade openshift-cert-manager-operator --approve + ``` + +5. 
**Switch channel and approve in one command**: + ``` + /olm:upgrade prometheus prometheus-operator --channel=beta --approve + ``` + +## Arguments +- **$1** (operator-name): Name of the operator to upgrade (required) + - Example: "openshift-cert-manager-operator" + - Must match the operator's Subscription name +- **$2** (namespace): Namespace where operator is installed (optional) + - If not provided, searches all namespaces + - Example: "cert-manager-operator" +- **$3+** (flags): Optional flags + - `--channel=<channel-name>`: Switch to specified channel + - Example: `--channel=stable-v1`, `--channel=tech-preview` + - Triggers upgrade/downgrade to the version in that channel + - `--approve`: Automatically approve pending InstallPlan + - Only needed for operators with Manual approval mode + - Equivalent to `/olm:approve` command + +## Notes + +- **Automatic Updates**: Operators with `installPlanApproval: Automatic` will upgrade automatically when new versions are available in their channel +- **Manual Approval**: Operators with `installPlanApproval: Manual` require explicit approval via `--approve` flag or `/olm:approve` command +- **Channel Switching**: Changing channels may result in upgrade or downgrade depending on the versions in each channel +- **Rollback**: OLM has limited rollback support. 
Switching back to the previous channel may work, but data migration issues may occur +- **Upgrade Timing**: Upgrades happen according to the operator's upgrade strategy (some may cause downtime) + +## Troubleshooting + +- **No updates available**: + ```bash + # Check current version + oc get csv -n {namespace} + + # Check available versions + oc get packagemanifest {operator-name} -n openshift-marketplace -o json + ``` + +- **Upgrade stuck or pending**: + ```bash + # Check InstallPlan status + oc get installplan -n {namespace} + + # Check for events + oc get events -n {namespace} --sort-by='.lastTimestamp' | tail -20 + ``` + +- **Manual approval required**: + ```bash + # List pending InstallPlans + oc get installplan -n {namespace} -o json | jq '.items[] | select(.spec.approved==false)' + + # Approve specific InstallPlan + /olm:approve {operator-name} {namespace} + ``` + +- **Upgrade failed**: + ```bash + # Check CSV status + oc describe csv -n {namespace} + + # Check operator logs + oc logs -n {namespace} deployment/{operator-deployment} + + # Run diagnostics + /olm:diagnose {operator-name} {namespace} + ``` + +- **Rollback needed**: + - OLM doesn't have built-in rollback + - Can try switching back to previous channel, but may have issues: + ```bash + oc patch subscription {operator-name} -n {namespace} \ + --type merge --patch '{"spec":{"channel":"{old-channel}"}}' + ``` + - Consider backup/restore of custom resources before upgrading + +## Related Commands + +- `/olm:status <operator-name>` - Check current version and available updates +- `/olm:approve <operator-name>` - Approve pending InstallPlans +- `/olm:install <operator-name>` - Install an operator +- `/olm:diagnose <operator-name>` - Diagnose upgrade issues + +## Additional Resources + +- [Red Hat OpenShift: Updating Installed Operators](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-updating-operators) +- [Red Hat OpenShift: Approving Operator 
Upgrades](https://docs.redhat.com/en/documentation/openshift_container_platform/4.20/html/operators/administrator-tasks#olm-approving-operator-upgrades_olm-updating-operators) +- [Operator Lifecycle Manager Documentation](https://olm.operatorframework.io/) + +