diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index da8eed40b..65db0ebea 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -238,6 +238,12 @@ "source": "./plugins/marketplace-ops", "description": "Maintenance commands for Claude Code plugin marketplaces", "version": "0.1.2" + }, + { + "name": "node-team", + "source": "./plugins/node-team", + "description": "OpenShift Node team assistant for development, deployment, debugging, and workflow tasks", + "version": "0.12.0" } ] } diff --git a/plugins/node-team/.claude-plugin/plugin.json b/plugins/node-team/.claude-plugin/plugin.json new file mode 100644 index 000000000..a42ce30ef --- /dev/null +++ b/plugins/node-team/.claude-plugin/plugin.json @@ -0,0 +1,8 @@ +{ + "name": "node-team", + "description": "OpenShift Node team assistant for development, deployment, debugging, and workflow tasks across kubelet, MCO, CRI-O, crun, conmonrs, Kueue operator, Jira, Red Hat KB/support cases, Prometheus, and platform docs.", + "version": "0.12.0", + "author": { + "name": "github.com/openshift-eng" + } +} diff --git a/plugins/node-team/config/team-roster-core.example.json b/plugins/node-team/config/team-roster-core.example.json new file mode 100644 index 000000000..a110f5990 --- /dev/null +++ b/plugins/node-team/config/team-roster-core.example.json @@ -0,0 +1,7 @@ +{ + "description": "Node Core team roster — maps Jira display names to GitHub handles", + "members": { + "Jira Display Name": "github-handle", + "Another Person": "their-github-handle" + } +} diff --git a/plugins/node-team/config/team-roster-dra.example.json b/plugins/node-team/config/team-roster-dra.example.json new file mode 100644 index 000000000..de63d34fe --- /dev/null +++ b/plugins/node-team/config/team-roster-dra.example.json @@ -0,0 +1,7 @@ +{ + "description": "Node Devices (DRA) team roster — maps Jira display names to GitHub handles", + "members": { + "Jira Display Name": "github-handle", + 
"Another Person": "their-github-handle" + } +} diff --git a/plugins/node-team/skills/node/SKILL.md b/plugins/node-team/skills/node/SKILL.md new file mode 100644 index 000000000..df76697c0 --- /dev/null +++ b/plugins/node-team/skills/node/SKILL.md @@ -0,0 +1,10 @@ +--- +name: node-team +description: "OpenShift Node team assistant. Covers kubelet, MCO, CRI-O, crun, conmonrs, Kueue operator, Jira (OCPNODE/OCPBUGS), Red Hat KB/support cases, Prometheus, and K8s/OCP docs. Triggers on any OpenShift node-layer development, deployment, debugging, or workflow task." +allowed-tools: Bash(curl:*) +--- + +## How to use this skill + +1. Read [references/INDEX.md](references/INDEX.md) to route to the relevant reference +2. Read the reference, then act on it — run scripts, fetch data, present results diff --git a/plugins/node-team/skills/node/references/INDEX.md b/plugins/node-team/skills/node/references/INDEX.md new file mode 100644 index 000000000..bdff2d4d0 --- /dev/null +++ b/plugins/node-team/skills/node/references/INDEX.md @@ -0,0 +1,34 @@ +# Node Skill Reference Index + +Reference files contain only tribal knowledge and non-obvious nuances. For discoverable details (build commands, repo layout, test targets), browse the source code directly. 
+ +Root: `./` + +## Setup + +|SETUP.md + +## Development + +|development:{kubelet-dev.md,mco-dev.md,crio-dev.md,crun-conmon.md,kueue-operator-dev.md,worktrees.md} + +## Deployment + +|deployment:{debug-binary.md} +|deployment/debug-binary:{crio.md,cross-compile.md,deploy.md,rollback.md,ssh-bastion.md} + +## Jira + +|jira.md + +## Red Hat Support + +|support.md + +## Platform Documentation + +|platform-docs.md + +## Prometheus + +|prometheus.md diff --git a/plugins/node-team/skills/node/references/SETUP.md b/plugins/node-team/skills/node/references/SETUP.md new file mode 100644 index 000000000..54221c85f --- /dev/null +++ b/plugins/node-team/skills/node/references/SETUP.md @@ -0,0 +1,94 @@ +# Standard Repo Setup + +All node team repos follow the same clone + worktree workflow for feature work. + +## Clone + +Clone into the current working directory (if not already present): + +```bash +git clone +cd +``` + +## Worktree for Feature Work + +Never work directly on the default branch. Create a worktree: + +```bash +git worktree add .worktrees/ -b wt/ +cd .worktrees/ +``` + +Deduce `` from the task description (e.g., "reflink feature" -> `reflink`, "fix cgroup leak" -> `fix-cgroup-leak`, "OCPNODE-1234" -> `ocpnode-1234`). + +## Worktree for PR Work + +To review or continue work on an existing PR: + +```bash +git fetch origin pull//head:pr- +git worktree add .worktrees/pr- pr- +cd .worktrees/pr- +``` + +If resuming work on a PR you've already fetched, check `git worktree list` first — the worktree may already exist. + +## Worktree for Jira Ticket Work + +To investigate or fix a Jira issue: + +1. Fetch the issue details to determine the component (see [jira.md](jira.md) for auth setup): + ```bash + curl -s -u "$JIRA_USER:$JIRA_API_TOKEN" "https://redhat.atlassian.net/rest/api/3/issue/OCPNODE-1234?fields=summary,components" + ``` +2. Map the component to a repo (see Component to Repo Mapping and Repo URLs below), confirm with the user, and clone if needed. +3. 
Create a worktree named after the ticket: + ```bash + git worktree add .worktrees/ocpnode-1234 -b wt/ocpnode-1234 + cd .worktrees/ocpnode-1234 + ``` + +## Component to Repo Mapping + +| Jira Label / Component | Repo | +|-------------------------|------| +| `crio` | cri-o | +| `kubelet` | kubernetes | +| `mco` | machine-config-operator | +| `crun` | crun | +| `conmonrs` | conmon-rs | +| `kueue` | kueue-operator | + +## Enable Node Assistant in the Worktree + +After creating a worktree, install the skill locally so it's available when you launch Claude there: + +```bash +cd .worktrees/ +claude plugin install node-assistant@node-skills --scope local +``` + +## Repo URLs + +| Component | Upstream | Downstream (OpenShift) | +|-----------|----------|------------------------| +| CRI-O | `https://github.com/cri-o/cri-o.git` | `https://github.com/openshift/cri-o.git` | +| Kubelet | `https://github.com/kubernetes/kubernetes.git` | `https://github.com/openshift/kubernetes.git` | +| MCO | — | `https://github.com/openshift/machine-config-operator.git` | +| crun | `https://github.com/containers/crun.git` | — | +| conmon-rs | `https://github.com/containers/conmon-rs.git` | — | +| Kueue Operator | `https://github.com/kubernetes-sigs/kueue.git` | `https://github.com/openshift/kueue-operator.git` | + +For upstream features and bug fixes, clone upstream. For OpenShift-specific work, clone downstream. + +## Cleanup + +```bash +# List worktrees +git worktree list + +# Remove when done +git worktree remove .worktrees/ +git branch -d wt/ +``` diff --git a/plugins/node-team/skills/node/references/deployment/debug-binary.md b/plugins/node-team/skills/node/references/deployment/debug-binary.md new file mode 100644 index 000000000..5198b28c8 --- /dev/null +++ b/plugins/node-team/skills/node/references/deployment/debug-binary.md @@ -0,0 +1,113 @@ +# Deploying Debug Binaries to RHCOS Nodes + +Deploy a custom-built binary (CRI-O, crun, kubelet, etc.) 
to an OpenShift worker node running RHCOS for debugging or POC testing. + +## The Challenge + +RHCOS (Red Hat Enterprise Linux CoreOS) has an **immutable `/usr` filesystem**. You cannot overwrite `/usr/bin/crio` or any other system binary directly. There is no package manager (`dnf`/`yum`), no compiler toolchain, and no development headers on the node. + +## The Solution + +**Bind-mount** your custom binary over the original. The bind mount shadows the original file without modifying the rootfs. The original binary remains intact underneath and is instantly recoverable by unmounting. + +``` +mount --bind /home/core/crio /usr/bin/crio +``` + +For cluster-wide deployment that survives reboots, use [layered images](layered-image.md) instead. + +## Four Phases + +### Phase 1: Build (Cross-Compile) + +Cross-compile the binary for `linux/amd64` using a Docker container that matches the target OS libraries. The binary must be dynamically linked against compatible library versions (same sonames as RHCOS). + +See [debug-binary/cross-compile.md](debug-binary/cross-compile.md) + +### Phase 2: Access (SSH Bastion) + +Reach the worker node via an SSH bastion pod. RHCOS nodes are not directly accessible from outside the cluster. You need to discover the SSH key used at cluster install time and deploy a bastion DaemonSet. + +See [debug-binary/ssh-bastion.md](debug-binary/ssh-bastion.md) + +### Phase 3: Deploy (Bind Mount) + +Transfer the binary to the node, verify it works, cordon/drain the node, set SELinux context, bind-mount over the original, restart the service. This phase has the most gotchas around SELinux, systemd, and service dependencies. + +See [debug-binary/deploy.md](debug-binary/deploy.md) + +### Phase 4: Rollback (Unmount) + +Unmount the bind mount, remove any config drop-ins, restart the service. The original binary is untouched underneath. 
+ +See [debug-binary/rollback.md](debug-binary/rollback.md) + +## Binary-Specific References + +Each binary has its own reference with build dependencies, systemd units, and deployment details: + +- **CRI-O**: [debug-binary/crio.md](debug-binary/crio.md) -- build tags, library deps, kubelet restart, config drop-ins + +## Safety Rules + +These are non-negotiable. Skipping any of these can take a node out of the cluster. + +1. **Verify SSH bastion connectivity first.** Before building or deploying anything, confirm you can reach the target worker node via the bastion. Run `uname -a` over SSH. If you cannot reach the node, nothing else matters. + +2. **Always preflight-test the binary** before deploying. SCP it to `/home/core/`, run `ldd` to verify libraries resolve, and run ` --version` to confirm it loads. If either fails, do not proceed. + +3. **Always cordon and drain first.** Never restart a container runtime on a node with running workloads. + +4. **Always test on ONE worker node.** Keep at least one healthy worker to maintain cluster capacity. + +5. **Always set the SELinux context** before bind-mounting: + ```bash + sudo chcon --reference=/usr/bin/ /home/core/ + ``` + Without the correct context (e.g., `container_runtime_exec_t` for CRI-O), systemd will refuse to execute the binary with `Permission denied`. + +6. **Know how to rollback before you deploy.** The rollback is: unmount, restart service. Read [debug-binary/rollback.md](debug-binary/rollback.md) before starting. 
+ +## Quick Reference + +| Step | Command | +|------|---------| +| Check node OS | `oc get nodes -o wide` | +| Check current binary version | SSH in, ` --version` | +| Cordon node | `oc adm cordon ` | +| Drain node | `oc adm drain --ignore-daemonsets --delete-emptydir-data` | +| Uncordon node | `oc adm uncordon ` | +| Verify node health | `oc get node ` (wait for Ready) | +| Check bind mounts | `ssh core@ "mount \| grep /usr/bin"` | + +## Deciding: Bind Mount vs Layered Image + +| | Bind Mount | Layered Image | +|---|---|---| +| Scope | Single node | All nodes in a pool | +| Survives reboot | No (unless systemd drop-in) | Yes | +| Speed | Minutes | 30-60 min (MCO rollout) | +| Use case | Quick debug/test | Cluster-wide validation, customer simulation | +| Rollback | `umount` | Delete MachineConfig | + +Use bind mounts for quick single-node testing. Use [layered images](layered-image.md) when you need the binary on all nodes or need it to persist across reboots. + +## Workflow Diagram + +``` +Local Machine RHCOS Worker Node +───────────── ───────────────── +1. Cross-compile in Docker + (linux/amd64, matching libs) + │ +2. SCP via bastion ─────────────► /home/core/ + 3. ldd, --version (preflight) + 4. chcon (SELinux) + │ + oc adm cordon/drain + 5. mount --bind + 6. systemctl restart + │ + oc adm uncordon + 7. 
Verify: --version, node Ready +``` diff --git a/plugins/node-team/skills/node/references/deployment/debug-binary/crio.md b/plugins/node-team/skills/node/references/deployment/debug-binary/crio.md new file mode 100644 index 000000000..1e06075ad --- /dev/null +++ b/plugins/node-team/skills/node/references/deployment/debug-binary/crio.md @@ -0,0 +1,195 @@ +# CRI-O Binary Reference + +## Binary Details + +| Property | Value | +|----------|-------| +| Binary path | `/usr/bin/crio` | +| Systemd unit | `crio.service` | +| Dependent service | `kubelet.service` (must restart after CRI-O restart) | +| RPM package | `cri-o` | +| SELinux context | `system_u:object_r:container_runtime_exec_t:s0` | +| Config drop-in dir | `/etc/crio/crio.conf.d/` | +| Linkmode | dynamic | + +## Build Dependencies (Debian/Bookworm) + +```dockerfile +RUN apt-get update && apt-get install -y --no-install-recommends \ + libseccomp-dev \ + libgpgme-dev \ + libassuan-dev \ + libgpg-error-dev \ + libselinux1-dev \ + pkg-config \ + make \ + git \ + && rm -rf /var/lib/apt/lists/* +``` + +## Dynamic Libraries + +CRI-O links against these shared libraries. The cross-compiled binary must show the same sonames in `ldd` output: + +``` +libseccomp.so.2 +libgpgme.so.11 +libassuan.so.0 +libgpg-error.so.0 +libc.so.6 +``` + +## Build Command + +```bash +make bin/crio +``` + +The Makefile auto-detects build tags based on available libraries. Expected tags on RHCOS-compatible builds: + +``` +containers_image_ostree_stub +exclude_graphdriver_btrfs +btrfs_noversion +seccomp +selinux +``` + +## Go Version + +Check `go.mod` for the required Go version. Use the matching `golang:-bookworm` Docker image. 
+ +## Example Dockerfile + +```dockerfile +FROM --platform=linux/amd64 golang:1.23-bookworm + +RUN apt-get update && apt-get install -y --no-install-recommends \ + libseccomp-dev libgpgme-dev libassuan-dev \ + libgpg-error-dev libselinux1-dev \ + pkg-config make git \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /build/cri-o +COPY . . + +RUN make bin/crio && ldd bin/crio +``` + +## pinns Binary + +CRI-O uses `pinns` for pod namespace pinning. If your changes affect networking or namespace handling, you may need to deploy a custom `pinns` as well: + +| Property | Value | +|----------|-------| +| Binary path | `/usr/bin/pinns` | +| Build command | `make bin/pinns` | + +`pinns` is a small C binary. Build it alongside CRI-O in the same Dockerfile: + +```bash +make bin/pinns +``` + +Deploy it the same way as CRI-O (SCP, chcon, bind-mount). + +## CRI-O Preflight Checks + +After SCPing the binary, run these CRI-O-specific checks: + +```bash +# Verify libraries +ssh core@${WORKER} "ldd /home/core/crio" + +# Check version and build info +ssh core@${WORKER} "/home/core/crio --version" + +# Validate it can parse the existing config +ssh core@${WORKER} "/home/core/crio config 2>&1 | head -5" +``` + +If `crio config` fails, the binary may have been built without required build tags or is incompatible with the node's config format. + +## CRI-O Restart Behavior + +**Restarting CRI-O terminates all running containers on the node** and disconnects kubelet from the container runtime. Kubelet will go inactive and the node will become `NotReady`. + +After starting CRI-O, **always restart kubelet**: + +```bash +sudo systemctl restart crio +sudo systemctl restart kubelet +``` + +Wait ~15 seconds, then verify the node returns to `Ready`: + +```bash +oc get node +``` + +This is why you must cordon/drain before restarting CRI-O. Without draining, all running workloads will be killed. + +## Config Drop-ins + +CRI-O reads additional configuration from `/etc/crio/crio.conf.d/`. 
Files are processed in lexicographic order; later files override earlier ones. + +Example (setting a runtime option): + +```bash +ssh core@${WORKER} "sudo tee /etc/crio/crio.conf.d/01-custom.conf <<'EOF' +[crio.runtime] +default_runtime = \"crun\" +EOF" + +ssh core@${WORKER} "sudo systemctl restart crio && sudo systemctl restart kubelet" +``` + +## Verifying the Deployment + +```bash +# Check version and build info +ssh core@${WORKER} "sudo crio --version" + +# Check it is running +ssh core@${WORKER} "sudo systemctl is-active crio" + +# Check kubelet is connected +ssh core@${WORKER} "sudo systemctl is-active kubelet" + +# Check node status (from your workstation) +oc get node + +# Check CRI-O logs for errors +ssh core@${WORKER} "sudo journalctl -u crio --no-pager -n 20" +``` + +## Monitoring After Deployment + +Watch for issues after uncordoning: + +```bash +# Watch for CRI-O errors +ssh core@${WORKER} "sudo journalctl -u crio -f" & + +# Watch pod events on this node +oc get events --field-selector involvedObject.kind=Node,involvedObject.name= -w + +# Verify pods can be scheduled and start +oc run test-pod --image=registry.access.redhat.com/ubi9/ubi-minimal:latest \ + --overrides='{"spec":{"nodeName":""}}' \ + --command -- sleep 30 +oc get pod test-pod -w +oc delete pod test-pod +``` + +## CRI-O Rollback + +Follow the standard rollback procedure in [rollback.md](rollback.md) with these values: + +| Parameter | Value | +|-----------|-------| +| `` | `crio` | +| `` | `/usr/bin/crio` | +| `` | `kubelet` | +| `` | `/etc/crio/crio.conf.d/01-custom.conf` (if created) | +| `` | `cri-o` | diff --git a/plugins/node-team/skills/node/references/deployment/debug-binary/cross-compile.md b/plugins/node-team/skills/node/references/deployment/debug-binary/cross-compile.md new file mode 100644 index 000000000..7003a1ecc --- /dev/null +++ b/plugins/node-team/skills/node/references/deployment/debug-binary/cross-compile.md @@ -0,0 +1,160 @@ +# Cross-Compiling for RHCOS + +RHCOS worker 
nodes run `linux/amd64`. If you are building from an arm64 Mac (Apple Silicon), you need to cross-compile using Docker with QEMU emulation. + +## Why Not Build on the Node? + +RHCOS is an immutable OS. It has no package manager (`dnf`/`yum`), no development headers, and no Go toolchain. Building must happen off-cluster. + +## Why Not Build a Static Binary? + +RHCOS ships dynamically-linked binaries. The target binary must link against the same shared libraries (same sonames) as the RPM-installed version on RHCOS. A statically-linked binary might work but diverges from the production configuration and may miss features gated behind dynamic library detection (e.g., SELinux, seccomp, gpgme). + +## Build Procedure + +### 1. Determine the Go Version + +Check `go.mod` in the source directory: + +```bash +head -3 go.mod +``` + +Use the matching `golang:-bookworm` Docker image. + +### 2. Determine Library Dependencies + +SSH into the target node and check what the existing binary links against: + +```bash +ssh core@${WORKER} "ldd \$(which )" +``` + +The cross-compiled binary must link against the same sonames. + +### 3. Create a Dockerfile + +Use a base image with matching libraries. Debian Bookworm and Fedora both produce binaries with compatible sonames for RHCOS 9.x. + +The binary-specific reference (e.g., [crio.md](crio.md)) lists the exact packages and build tags needed. + +```dockerfile +FROM --platform=linux/amd64 golang:-bookworm + +RUN apt-get update && apt-get install -y --no-install-recommends \ + \ + pkg-config make git \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /build/ +COPY . . + +RUN make && ldd +``` + +### 4. Build + +```bash +docker build --platform linux/amd64 -f Dockerfile.cross -t -cross . +``` + +This uses QEMU emulation on arm64 Mac. Expect builds to take 2-5x longer than native. + +### 5. 
Extract the Binary + +```bash +docker create --platform linux/amd64 --name extract -cross +mkdir -p bin +docker cp extract:/build// ./bin/ +docker rm extract +``` + +### 6. Verify Architecture + +```bash +file ./bin/ +# Should show: ELF 64-bit LSB executable, x86-64 +``` + +## Go Cross-Compile Settings + +For Go binaries (e.g., CRI-O): + +```bash +GOOS=linux GOARCH=amd64 CGO_ENABLED=1 +``` + +`CGO_ENABLED=1` is required because these binaries link against C libraries (libseccomp, libgpgme, etc.). The Docker container provides the correct C toolchain for the target platform. + +## Rust Cross-Compile (conmon-rs) + +conmon-rs is written in Rust. Cross-compile for `x86_64-unknown-linux-gnu`: + +```bash +# In the Docker container +rustup target add x86_64-unknown-linux-gnu + +# Build +cargo build --release --target x86_64-unknown-linux-gnu +``` + +Use a Fedora or RHEL-based container with matching system libraries. The Dockerfile should install: + +```dockerfile +FROM --platform=linux/amd64 fedora:latest + +RUN dnf install -y \ + rust cargo \ + glib2-devel libseccomp-devel systemd-devel \ + capnproto capnp-devel \ + pkg-config make git \ + && dnf clean all + +WORKDIR /build/conmon-rs +COPY . . + +RUN cargo build --release && ldd target/release/conmonrs +``` + +## C Cross-Compile (crun) + +crun uses autotools. Build in a matching container: + +```dockerfile +FROM --platform=linux/amd64 fedora:latest + +RUN dnf install -y \ + gcc automake autoconf libtool \ + libcap-devel systemd-devel \ + yajl-devel libseccomp-devel \ + python3 git \ + && dnf clean all + +WORKDIR /build/crun +COPY . . + +RUN ./autogen.sh && ./configure && make +RUN ldd crun +``` + +## Verifying the Binary + +After extraction, verify in a matching container: + +```bash +# Run ldd inside a matching container to confirm library compatibility +docker run --platform linux/amd64 --rm -v $(pwd)/bin:/check debian:bookworm \ + ldd /check/ +``` + +All libraries must resolve. 
If any show `not found`, the binary was built against incompatible library versions. + +## Common Issues + +| Symptom | Cause | Fix | +|---------|-------|-----| +| `ldd` shows `not found` | Wrong base image or missing -dev package | Check sonames on target node, use matching base image | +| `GLIBC_x.xx not found` | glibc version mismatch | Use older base image (bookworm is usually safe for RHCOS 9.x) | +| Binary runs but features missing | Wrong build tags | Check binary-specific reference for required tags | +| Exec format error on node | Wrong architecture | Verify `file` output shows `x86-64` | +| Build extremely slow | QEMU emulation on arm64 Mac | Expected, 2-5x slower than native | diff --git a/plugins/node-team/skills/node/references/deployment/debug-binary/deploy.md b/plugins/node-team/skills/node/references/deployment/debug-binary/deploy.md new file mode 100644 index 000000000..fa1bd71de --- /dev/null +++ b/plugins/node-team/skills/node/references/deployment/debug-binary/deploy.md @@ -0,0 +1,198 @@ +# Deploying the Binary + +This is the critical phase. Follow the steps in order. Do not skip the preflight check. + +## Step 1: SCP Binary to Node + +Transfer the cross-compiled binary to the node via the SSH bastion: + +```bash +scp -i $SSH_KEY \ + -o StrictHostKeyChecking=no \ + -o ProxyCommand="ssh -i $SSH_KEY -A -o StrictHostKeyChecking=no -o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" \ + ./bin/ core@${WORKER}:/home/core/ +``` + +Make it executable: + +```bash +ssh core@${WORKER} "chmod +x /home/core/" +``` + +## Step 2: Preflight Test + +Verify the binary works before touching anything: + +```bash +# Check libraries resolve +ssh core@${WORKER} "ldd /home/core/" + +# Check it runs +ssh core@${WORKER} "/home/core/ --version" +# or +ssh core@${WORKER} "/home/core/ -h" +``` + +If `ldd` shows `not found` for any library, the binary was built against incompatible versions. Go back to the cross-compile step. 
+ +If `--version` fails, check the error -- it may be an architecture mismatch, missing library, or permissions issue. + +## Step 3: Set SELinux Context + +RHCOS runs SELinux in enforcing mode. Systemd checks the SELinux context of binaries before executing them. A binary in `/home/core/` has `user_home_t` context, which systemd will reject. + +Copy the context from the original binary: + +```bash +ssh core@${WORKER} "sudo chcon --reference= /home/core/" +``` + +Verify: + +```bash +ssh core@${WORKER} "ls -laZ /home/core/ " +``` + +Both should show the same context (e.g., `system_u:object_r:container_runtime_exec_t:s0` for CRI-O). + +Without this step, systemd will fail with: +``` +Failed to locate executable : Permission denied +``` + +## Step 4: Cordon Node + +Prevent new pods from being scheduled: + +```bash +oc adm cordon +``` + +## Step 5: Drain Node + +Evict existing workloads: + +```bash +oc adm drain --ignore-daemonsets --delete-emptydir-data --timeout=120s +``` + +If drain times out on a stuck pod: + +```bash +oc get pods --all-namespaces --field-selector spec.nodeName= | grep Terminating +oc delete pod -n --force --grace-period=0 +``` + +## Step 6: Create Bind Mount + +Stop the service, mount the new binary, start the service: + +```bash +# Stop the service +ssh core@${WORKER} "sudo systemctl stop " + +# Bind mount the new binary over the original +ssh core@${WORKER} "sudo mount --bind /home/core/ " + +# Start the service +ssh core@${WORKER} "sudo systemctl start " +``` + +The bind mount shadows the original binary without modifying it. The original remains intact underneath. + +## Step 7: Make Persistent (Optional) + +The bind mount does not survive a reboot. 
To make it persistent, create a systemd drop-in that re-creates the mount before the service starts: + +```bash +ssh core@${WORKER} "sudo mkdir -p /etc/systemd/system/.d" + +ssh core@${WORKER} "sudo tee /etc/systemd/system/.d/10-bind-mount.conf <<'EOF' +[Service] +ExecStartPre=/usr/bin/mount --bind /home/core/ +EOF" + +ssh core@${WORKER} "sudo systemctl daemon-reload" +``` + +## Step 8: Restart Service and Dependents + +Verify the service is running with the new binary: + +```bash +# Check service status +ssh core@${WORKER} "sudo systemctl is-active " + +# Verify the new version +ssh core@${WORKER} "sudo --version" + +# Restart dependent services (e.g., kubelet after CRI-O restart) +ssh core@${WORKER} "sudo systemctl restart " +ssh core@${WORKER} "sudo systemctl is-active " +``` + +Check the binary-specific reference (e.g., [crio.md](crio.md)) for which dependent services need restarting. + +## Step 9: Verify Node Health + +```bash +# Wait for the node to become Ready +oc get node +``` + +If the node stays `NotReady`, check dependent services. A common issue is forgetting to restart kubelet after CRI-O restart. + +## Step 10: Uncordon + +```bash +oc adm uncordon +``` + +## Optional: Config Drop-ins + +To add configuration (e.g., feature flags), write a drop-in file and restart: + +```bash +ssh core@${WORKER} "sudo tee <<'EOF' + +EOF" + +ssh core@${WORKER} "sudo systemctl restart " +``` + +## Updating an Already-Deployed Binary + +If you need to deploy a newer version and the bind mount is already in place: + +1. SCP the new binary to a **different filename** (the mounted path is busy) +2. Stop the service +3. Unmount the old bind mount +4. Rename the new file to the expected name +5. Set SELinux context +6. 
Mount and start + +```bash +scp core@${WORKER}:/home/core/-v2 + +ssh core@${WORKER} "sudo systemctl stop && \ + sudo umount && \ + mv /home/core/-v2 /home/core/ && \ + chmod +x /home/core/ && \ + sudo chcon --reference= /home/core/ && \ + sudo mount --bind /home/core/ && \ + sudo systemctl start " +``` + +## Full Single-Command Deploy + +For convenience, after preflight passes and SELinux is set: + +```bash +oc adm cordon && \ +oc adm drain --ignore-daemonsets --delete-emptydir-data --timeout=120s && \ +ssh core@${WORKER} "sudo systemctl stop && \ + sudo mount --bind /home/core/ && \ + sudo systemctl start && \ + sudo systemctl restart " && \ +oc adm uncordon +``` diff --git a/plugins/node-team/skills/node/references/deployment/debug-binary/rollback.md b/plugins/node-team/skills/node/references/deployment/debug-binary/rollback.md new file mode 100644 index 000000000..1b3f9fde1 --- /dev/null +++ b/plugins/node-team/skills/node/references/deployment/debug-binary/rollback.md @@ -0,0 +1,137 @@ +# Rollback + +Rollback is straightforward because the bind mount never touched the original binary. The original is intact underneath the mount. 
+ +## Procedure + +### Step 1: Cordon and Drain + +```bash +oc adm cordon +oc adm drain --ignore-daemonsets --delete-emptydir-data --timeout=120s +``` + +### Step 2: Stop Service and Unmount + +```bash +# Stop the service +ssh core@${WORKER} "sudo systemctl stop " + +# Unmount the bind mount (restores original binary) +ssh core@${WORKER} "sudo umount " +``` + +### Step 3: Remove Systemd Drop-ins + +If you created a systemd drop-in to persist the mount across reboots, remove only that file (the directory may hold other drop-ins that must stay in place): + +```bash +ssh core@${WORKER} "sudo rm -f /etc/systemd/system/.d/10-bind-mount.conf" +ssh core@${WORKER} "sudo systemctl daemon-reload" +``` + +### Step 4: Remove Config Drop-ins + +If you added any configuration drop-in files: + +```bash +ssh core@${WORKER} "sudo rm -f " +``` + +### Step 5: Start Service and Dependents + +```bash +# Start the service (now using the original binary) +ssh core@${WORKER} "sudo systemctl start " + +# Restart dependent services +ssh core@${WORKER} "sudo systemctl restart " + +# Verify original version +ssh core@${WORKER} "sudo --version" +``` + +### Step 6: Verify Node Health + +```bash +oc get node +``` + +Wait for `Ready` status. If the node does not recover, check service logs: + +```bash +ssh core@${WORKER} "sudo journalctl -u --no-pager -n 30" +``` + +### Step 7: Uncordon + +```bash +oc adm uncordon +``` + +## Quick Rollback (Single Command) + +For when you need to roll back fast: + +```bash +oc adm cordon && \ +oc adm drain --ignore-daemonsets --delete-emptydir-data --timeout=120s && \ +ssh core@${WORKER} "sudo systemctl stop && \ + sudo umount && \ + sudo rm -f /etc/systemd/system/.d/10-bind-mount.conf && \ + sudo rm -f && \ + sudo systemctl daemon-reload && \ + sudo systemctl start && \ + sudo systemctl restart " && \ +oc adm uncordon +``` + +## Cleanup + +The debug binary remains at `/home/core/` after unmounting. 
Remove it if no longer needed: + +```bash +ssh core@${WORKER} "rm /home/core/" +``` + +## Troubleshooting + +### Service will not start after rollback + +This should not happen since the original binary is untouched, but if it does: + +1. **Verify the unmount happened:** + ```bash + ssh core@${WORKER} "mount | grep " + ``` + If it still shows a bind mount, run `sudo umount ` again. + +2. **Verify the original binary is intact:** + ```bash + ssh core@${WORKER} "rpm -V " + ``` + +3. **Restore SELinux context on the original:** + ```bash + ssh core@${WORKER} "sudo restorecon " + ``` + +4. **Check logs:** + ```bash + ssh core@${WORKER} "sudo journalctl -u -n 50" + ``` + +### Unmount fails with "target is busy" + +The service is still using the binary. Stop it first: + +```bash +ssh core@${WORKER} "sudo systemctl stop " +ssh core@${WORKER} "sudo umount " +``` + +If it is still busy, check for other processes using it: + +```bash +ssh core@${WORKER} "sudo fuser -v " +``` diff --git a/plugins/node-team/skills/node/references/deployment/debug-binary/ssh-bastion.md b/plugins/node-team/skills/node/references/deployment/debug-binary/ssh-bastion.md new file mode 100644 index 000000000..69d27b86d --- /dev/null +++ b/plugins/node-team/skills/node/references/deployment/debug-binary/ssh-bastion.md @@ -0,0 +1,150 @@ +# SSH Bastion Access to RHCOS Nodes + +RHCOS worker nodes are not directly accessible via SSH. The [ssh-bastion](https://github.com/eparis/ssh-bastion) project deploys a bastion pod that proxies SSH connections to cluster nodes. + +## Setup + +### 1. Deploy the Bastion + +Use the deploy script from the upstream repo: + +```bash +curl -sL https://raw.githubusercontent.com/eparis/ssh-bastion/master/deploy/deploy.sh | bash +``` + +The script creates the `openshift-ssh-bastion` namespace, deploys the bastion pod, and prints the LoadBalancer IP. 
+ +If the script is unavailable, apply the individual manifests: + +```bash +for f in serviceaccount role clusterrole deployment service; do + oc apply -f "https://raw.githubusercontent.com/eparis/ssh-bastion/master/deploy/${f}.yaml" +done +``` + +**LoadBalancer warm-up:** After deployment, the cloud LoadBalancer (especially on GCP) takes 30-60 seconds to become reachable. SSH connections will be refused during this period. Wait and retry -- do not assume the bastion is broken. + +```bash +sleep 30 +ssh -i $SSH_KEY -o ConnectTimeout=15 core@${BASTION_HOST} echo "connected" +``` + +### 2. Discover the SSH Key + +The cluster's `99-worker-ssh` MachineConfig contains the authorized public key. Match it against your local keys: + +```bash +# Get the public key baked into the nodes +oc get machineconfig 99-worker-ssh -o jsonpath='{.spec.config.passwd.users[0].sshAuthorizedKeys[0]}' + +# Compare against local keys +for f in ~/.ssh/*.pub; do echo "=== $f ===" && cat "$f"; done +``` + +The matching key is what you need. Common gotcha: GCP clusters often use `~/.ssh/google_compute_engine`, not `~/.ssh/id_rsa`. + +### 3. Get the Bastion Host + +```bash +BASTION_HOST=$(oc get service --all-namespaces -l run=ssh-bastion \ + -o go-template='{{ with (index (index .items 0).status.loadBalancer.ingress 0) }}{{ or .hostname .ip }}{{end}}') +echo "Bastion: $BASTION_HOST" +``` + +## Running Commands + +Use raw SSH with the proxy command. The upstream `ssh-bastion.sh` script appends `sudo -i` which makes it unsuitable for non-interactive command execution. 
+ +```bash +SSH_KEY=~/.ssh/<private-key-file> +BASTION_HOST=<bastion-host-or-ip> +WORKER=<worker-internal-ip-or-hostname> + +ssh -i $SSH_KEY \ + -o StrictHostKeyChecking=no \ + -o ProxyCommand="ssh -i $SSH_KEY -A -o StrictHostKeyChecking=no -o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" \ + core@${WORKER} "<command>" +``` + +## SCP (Transferring Files) + +Use SCP with the same proxy command: + +```bash +scp -i $SSH_KEY \ + -o StrictHostKeyChecking=no \ + -o ProxyCommand="ssh -i $SSH_KEY -A -o StrictHostKeyChecking=no -o ServerAliveInterval=30 -W %h:%p core@${BASTION_HOST}" \ + ./local-file core@${WORKER}:/home/core/remote-file +``` + +The upstream [scp.sh](https://github.com/eparis/ssh-bastion/blob/master/scp.sh) script is also available but requires `SSH_KEY_PATH` to be set. + +## Alternative: oc debug node + +For a quick shell on a node without setting up the bastion: + +```bash +oc debug node/<node-name> +chroot /host +``` + +This gives you a root shell on the node. Limitations: +- Cannot SCP files (no file transfer mechanism) +- Cannot run background processes reliably +- Session dies if the debug pod is evicted +- Runs as a pod, not a real SSH session + +Use `oc debug node` for inspection. Use the SSH bastion for deployment workflows that need SCP. + +## Writable Paths on RHCOS + +RHCOS has an immutable rootfs. You can only write to: +- `/home/core/` -- user home +- `/var/` -- variable data +- `/etc/` -- configuration (overlayed) +- `/tmp/` -- temporary + +Always SCP files to `/home/core/` first. + +## Gotcha: SCP Fails on Bind-Mounted Files + +If the target file is already bind-mounted (busy), SCP will fail with `Failure`. Copy to a new filename (e.g., `/home/core/binary-v2`), then swap after unmounting. + +## Troubleshooting + +### Bastion connectivity issues + +If SSH connections are intermittently refused (`Connection refused` on port 22) after the bastion pod is running: + +1. 
**Restart the bastion pod.** Deleting the pod lets the deployment recreate it: + +```bash +oc delete pod -n openshift-ssh-bastion -l run=ssh-bastion +sleep 30 +``` + +2. **Verify the pod is running and the LB has an IP:** + +```bash +oc get pods -n openshift-ssh-bastion -o wide +oc get svc -n openshift-ssh-bastion ssh-bastion +``` + +3. **Re-fetch the bastion IP** (it should not change, but confirm): + +```bash +BASTION_HOST=$(oc get service -n openshift-ssh-bastion ssh-bastion \ + -o go-template='{{ with (index .status.loadBalancer.ingress 0) }}{{ or .hostname .ip }}{{end}}') +``` + +### Permission denied + +- Verify you are using the correct SSH key (see step 2 above) +- Verify you are connecting as user `core` (not `root`) +- Check that the SSH agent has the key loaded: `ssh-add -l` + +### Connection timeout + +- The node might not be reachable from the bastion network +- Verify the node internal IP: `oc get node <node-name> -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}'` +- Check that the bastion pod is in the same VPC/network as the nodes diff --git a/plugins/node-team/skills/node/references/development/crio-dev.md b/plugins/node-team/skills/node/references/development/crio-dev.md new file mode 100644 index 000000000..dd34b84be --- /dev/null +++ b/plugins/node-team/skills/node/references/development/crio-dev.md @@ -0,0 +1,26 @@ +# CRI-O: Non-Obvious Notes (Tribal Knowledge) + +- **Upstream**: `https://github.com/cri-o/cri-o.git` +- **Downstream (OpenShift)**: `https://github.com/openshift/cri-o.git` + +For build commands, repo layout, dependencies, and test targets — browse the repo directly (Makefile, README, go.mod). + +## Branch Mapping + +OCP 4.X uses CRI-O 1.(X+13).x. The formula: **CRI-O minor = OCP minor + 13**. + +| OCP | CRI-O | +|-----|-------| +| 4.18 | 1.31.x | +| 4.17 | 1.30.x | +| 4.16 | 1.29.x | + +Downstream branches are `release-4.X`, upstream are `release-1.X`. 
+ +## OpenShift-Specific + +- CRI-O **does not build natively on macOS** — CGO is required for seccomp and system libraries. Use a containerized build for cross-compilation. +- On RHCOS, CRI-O config is **managed by the MCO**. Do not edit `/etc/crio/crio.conf` directly — it will be overwritten. Use `ContainerRuntimeConfig` CRs instead. +- Drop-in files in `/etc/crio/crio.conf.d/` follow naming conventions: `00-default` (RHCOS base), `01-ctrcfg-*` (from ContainerRuntimeConfig CR), `10-*` (MCO overrides). +- Registry mirrors on OpenShift: configure via `ImageContentSourcePolicy` or `ImageDigestMirrorSet` CRs, not by editing `/etc/containers/registries.conf`. +- CRI-O exposes Prometheus metrics at `localhost:9537/metrics`. diff --git a/plugins/node-team/skills/node/references/development/crun-conmon.md b/plugins/node-team/skills/node/references/development/crun-conmon.md new file mode 100644 index 000000000..84c1e2f8b --- /dev/null +++ b/plugins/node-team/skills/node/references/development/crun-conmon.md @@ -0,0 +1,26 @@ +# crun and conmon-rs: Non-Obvious Notes (Tribal Knowledge) + +- **crun**: `https://github.com/containers/crun.git` (upstream only, no downstream fork) +- **conmon-rs**: `https://github.com/containers/conmon-rs.git` (upstream only, no downstream fork) + +For build commands, repo layout, and test targets — browse each repo directly (Makefile/Cargo.toml, README). + +## Version History + +- **crun** became the default OCI runtime for new clusters starting in **OCP 4.18** (it was available as an opt-in runtime in earlier releases); clusters upgraded from earlier releases keep their existing runtime (runc by default) until switched via a `ContainerRuntimeConfig` CR. +- **conmon-rs** replaced the C-based conmon starting in **OCP 4.14**. conmon-rs uses Cap'n Proto RPC (schema in `conmon-rs/common/proto/conmon.capnp`) instead of pipe-based IPC. 
+ +## Binary Paths on RHCOS + +| Binary | Path | +|--------|------| +| crun | `/usr/bin/crun` | +| conmonrs | `/usr/libexec/crio/conmonrs` | +| conmon (legacy) | `/usr/bin/conmon` | + +After replacing any of these binaries on a node, **restart CRI-O** (`sudo systemctl restart crio`) for it to pick up the change. 
+ +## Build Notes + +- **crun**: Fully static builds with glibc are not recommended. Use musl libc for true static builds, or use dynamic linking matching RHCOS library versions. A containerized build (Fedora/UBI) is the easiest path for cross-compilation. +- **conmon-rs**: Use `cross` (cargo plugin) for cross-compilation rather than native cross-compilation toolchains — avoids linker issues. diff --git a/plugins/node-team/skills/node/references/development/kubelet-dev.md b/plugins/node-team/skills/node/references/development/kubelet-dev.md new file mode 100644 index 000000000..cd77ff34e --- /dev/null +++ b/plugins/node-team/skills/node/references/development/kubelet-dev.md @@ -0,0 +1,22 @@ +# Kubelet (OpenShift): Non-Obvious Notes (Tribal Knowledge) + +- **Upstream**: `https://github.com/kubernetes/kubernetes.git` +- **Downstream (OpenShift)**: `https://github.com/openshift/kubernetes.git` + +For build commands, repo layout, and test targets — browse the repo directly (Makefile, README, go.mod, openshift-hack/). + +## Carry Patch Conventions + +OpenShift maintains patches on top of upstream Kubernetes. Commits use these prefixes: + +- `UPSTREAM: <carry>:` — OpenShift-specific patch carried across rebases, not intended for upstream +- `UPSTREAM: <drop>:` — temporary commit (e.g. regenerated files or vendoring updates) that is dropped and recreated at the next upstream rebase +- `UPSTREAM: 12345:` — cherry-pick of upstream PR #12345 + +These appear in the `UPSTREAM/` directory and as commit prefixes in git history. + +## OpenShift-Specific + +- The kubelet binary ships inside the **`ose-hyperkube`** image in OCP. +- Kubelet configuration on RHCOS is rendered at `/etc/kubernetes/kubelet.conf`, managed by MCO. Customize via `KubeletConfig` CR, not by editing the file. +- For active development, work against `master` unless backporting a fix to `release-4.X`. 
diff --git a/plugins/node-team/skills/node/references/development/kueue-operator-dev.md b/plugins/node-team/skills/node/references/development/kueue-operator-dev.md new file mode 100644 index 000000000..6acdc65f5 --- /dev/null +++ b/plugins/node-team/skills/node/references/development/kueue-operator-dev.md @@ -0,0 +1,17 @@ +# OpenShift Kueue Operator: Non-Obvious Notes (Tribal Knowledge) + +- **Upstream Kueue**: `https://github.com/kubernetes-sigs/kueue.git` +- **Downstream Operator**: `https://github.com/openshift/kueue-operator.git` + +For build commands, repo layout, CRD types, and test targets — browse the repo directly (Makefile, README, go.mod, api/). + +## Architecture + +The operator manages the lifecycle of upstream Kueue on OpenShift. It deploys into `openshift-kueue-operator` namespace and creates the upstream `kueue-controller-manager` in the `kueue-system` namespace. + +When changes are needed in upstream Kueue itself, submit a PR to `kubernetes-sigs/kueue` first, then update the operator to consume the new version. + +## OpenShift-Specific + +- Built with operator-sdk framework (controller-runtime, controller-gen, OLM bundles). +- CVO override warning applies here too — scale down CVO if patching the operator deployment manually during development. diff --git a/plugins/node-team/skills/node/references/development/mco-dev.md b/plugins/node-team/skills/node/references/development/mco-dev.md new file mode 100644 index 000000000..4a3c1d43a --- /dev/null +++ b/plugins/node-team/skills/node/references/development/mco-dev.md @@ -0,0 +1,40 @@ +# MCO (Machine Config Operator): Non-Obvious Notes (Tribal Knowledge) + +- **Repo**: `https://github.com/openshift/machine-config-operator.git` (no upstream — MCO is OpenShift-only) + +For build commands, repo layout, CRD types, and test targets — browse the repo directly (Makefile, README, go.mod, pkg/apis/). + +## Rendering Pipeline + +MachineConfigs are sorted **lexicographically by name** before merging. 
This is why naming conventions matter (e.g., `00-worker`, `01-worker-custom`). Later configs override earlier ones for files (by path) and systemd units (by name). Kernel arguments and extensions are accumulated (union). + +## MCD Reboot Rules + +The MCD **does** trigger a reboot when: +- Files in `/etc` or `/usr` change +- Systemd units are added/removed/modified +- Kernel arguments or OS extensions change +- The OS image changes + +The MCD **does not** reboot when: +- Only SSH keys are updated +- Only node annotations change + +## CVO Override Warning + +The Cluster Version Operator (CVO) will **revert manual image overrides** on MCO deployments. During development, scale down CVO: + +```bash +oc scale deployment cluster-version-operator -n openshift-cluster-version --replicas=0 +``` + +Remember to scale it back (`--replicas=1`) when done. + +## On-Cluster Layering (OCP 4.13+) + +On-cluster layering builds custom OS images using `MachineOSConfig` and `MachineOSBuild` resources. The MCD applies layered images via `rpm-ostree rebase` or `bootc switch`. + +## Other Notes + +- MCP `maxUnavailable` defaults to 1 — nodes update one at a time. +- Machine Config Server (MCS) serves Ignition configs on port **22623**. diff --git a/plugins/node-team/skills/node/references/development/worktrees.md b/plugins/node-team/skills/node/references/development/worktrees.md new file mode 100644 index 000000000..d0c669757 --- /dev/null +++ b/plugins/node-team/skills/node/references/development/worktrees.md @@ -0,0 +1,53 @@ +# Worktrees: Parallel Multi-Repo Workspaces + +Create isolated workspaces using `git worktree` with a `wt/<name>` branch under `.worktrees/<name>/`. When submodules are present, each one gets its own worktree and branch inside the workspace. 
+ +## Create a Workspace + +```bash +# Sync submodules first +git fetch --quiet origin +git submodule update --init --quiet +git submodule foreach --quiet 'git fetch --quiet origin; git checkout main --quiet 2>/dev/null; git merge --ff-only origin/main --quiet 2>/dev/null || true' + +# Create root worktree +git worktree add .worktrees/<name> -b wt/<name> HEAD + +# Create submodule worktrees +git submodule foreach --quiet 'git worktree add "$toplevel/.worktrees/<name>/$sm_path" -b "wt/<name>" HEAD' + +cd .worktrees/<name>/ +``` + +## Merge Back + +```bash +# For each submodule: merge wt/<name> into main +git submodule foreach --quiet ' + git checkout main --quiet + git merge --ff-only wt/<name> --quiet 2>/dev/null || git merge wt/<name> --no-edit --quiet +' + +# Merge root +git checkout main +git merge --ff-only wt/<name> --quiet 2>/dev/null || git merge wt/<name> --no-edit --quiet + +# Update submodule pointers +git add -A && git diff --cached --quiet || git commit -m "Merge workspace <name>" +``` + +## Remove + +```bash +git submodule foreach --quiet 'git worktree remove --force "$toplevel/.worktrees/<name>/$sm_path" 2>/dev/null; git branch -D "wt/<name>" 2>/dev/null' +git worktree remove --force .worktrees/<name> +git branch -D wt/<name> +``` + +## Non-Obvious Details + +- **Branch prefix is `wt/`** — every workspace creates `wt/<name>` branches in the root and all submodules. Don't manually create branches with this prefix. +- **Always sync submodules before branching** — fetch and fast-forward all submodules to their tracked branch so your workspace starts from the latest remote state. +- **Remote agent pushes** — if an agent pushed commits to `origin/wt/<name>`, fetch and merge them before merging into main: `git fetch origin; git merge origin/wt/<name>`. +- **Reconcile submodule pointers after merge** — ensure each submodule's main matches the commit the root repo expects. Prevents pointer drift. 
+- **Only fast-forward during sync** — never rebase or create merge commits during sync. If a submodule has diverged, warn and skip. 
diff --git a/plugins/node-team/skills/node/references/jira.md b/plugins/node-team/skills/node/references/jira.md new file mode 100644 index 000000000..16ac41bdb --- /dev/null +++ b/plugins/node-team/skills/node/references/jira.md @@ -0,0 +1,188 @@ +# Node Team Jira Reference + +Red Hat Jira: `redhat.atlassian.net`. REST API v3. Use `curl` directly. + +## Authentication + +API token from env or macOS Keychain: + +```bash +JIRA_API_TOKEN="${JIRA_API_TOKEN:-$(security find-generic-password -s "JIRA_API_TOKEN" -w 2>/dev/null)}" +JIRA_USER="${JIRA_EMAIL:-$(security find-generic-password -s "JIRA_API_TOKEN" -g 2>&1 | grep acct | sed 's/.*="//;s/"//')}" +: "${JIRA_USER:=$(git config user.email)}" +[[ -n "$JIRA_USER" && "$JIRA_USER" != *@* ]] && JIRA_USER="${JIRA_USER}@redhat.com" +``` + +All requests: `curl -s -u "$JIRA_USER:$JIRA_API_TOKEN" -H "Content-Type: application/json"`. + +## REST API Endpoints + +Base: `https://redhat.atlassian.net` + +| Method | Path | Use | +|--------|------|-----| +| POST | `/rest/api/3/search/jql` | Search. Body: `{"jql":"...","maxResults":50,"fields":["key","summary",...]}` | +| GET | `/rest/api/3/issue/{key}` | Get issue. Optional `?fields=summary,status,...` | +| POST | `/rest/api/3/issue` | Create. Body: `{"fields":{"project":{"key":"OCPNODE"},"issuetype":{"name":"Story"},"summary":"..."}}` | +| PUT | `/rest/api/3/issue/{key}` | Update fields. Body: `{"fields":{"customfield_10028":5}}` | +| PUT | `/rest/api/3/issue/{key}/assignee` | Assign. Body: `{"accountId":"..."}` | +| GET | `/rest/api/3/issue/{key}/comment` | List comments | +| POST | `/rest/api/3/issue/{key}/comment` | Add comment (body in ADF format, see below) | +| GET | `/rest/api/3/issue/{key}/transitions` | Available transitions | +| POST | `/rest/api/3/issue/{key}/transitions` | Transition. Body: `{"transition":{"id":"31"}}` | +| POST | `/rest/api/3/issue/{key}/remotelink` | Add link. 
Body: `{"object":{"url":"...","title":"..."}}` | +| GET | `/rest/api/3/user/search?query={name}` | Find user by name | +| GET | `/rest/agile/1.0/board/7845/sprint?state=active` | List sprints (board 7845 = Node) | +| GET | `/rest/agile/1.0/sprint/{id}/issue?maxResults=100&fields=...` | Sprint issues | +| POST | `/rest/agile/1.0/sprint/{id}/issue` | Move to sprint. Body: `{"issues":["KEY-1","KEY-2"]}` | + +## ADF (Atlassian Document Format) + +Jira Cloud uses ADF for rich text fields (description, comments, blocked reason). When **posting** comments or creating issues with descriptions: + +```json +{ + "body": { + "version": 1, + "type": "doc", + "content": [{"type": "paragraph", "content": [{"type": "text", "text": "Your text here"}]}] + } +} +``` + +When **reading** ADF from responses: recursively walk `content` arrays, extract `text` from `type: "text"` nodes. Handle: `marks` with `type: "link"` (append URL), `type: "mention"` (extract `attrs.text`), `type: "blockCard"/"inlineCard"` (extract `attrs.url`). Paragraphs, headings, list items end with newlines. 
+ +## Projects + +| Project | Tracks | +|---------|--------| +| OCPNODE | Node team epics, stories, tasks, spikes | +| OCPBUGS | Cross-team bugs (filter by Node components) | +| RHOCPPRIO | Red Hat OpenShift Priority List (escalations) | +| OCPKUEUE | Kueue-specific work | +| OCPSTRAT | Strategy/feature tracking | + +## Components We Own + +Node, Node / CRI-O, Node / Kubelet, Node / CPU manager, Node / Memory manager, Node / Topology manager, Node / Numa aware Scheduling, Node / Device Manager, Node / Pod resource API, Node / Node Problem Detector, Node / Kueue, Node / Instaslice-operator + +## Boards & Sprints + +| ID | Board | +|----|-------| +| 7845 | Node board (scrum) | +| 4383 | Node-Epics (kanban) | +| 9874 | Node QE (scrum) | + +Sprint naming: `OCP Node Core Sprint N`, `OCP Node Devices Sprint N`, `OCP Kueue Sprint N`, `CNF Compute Sprint N` + +Filter sprints to Node-related by checking if `"Node"` or `"Kueue"` appears in the sprint name. + +Team queue: `aos-node@redhat.com` + +## Team Roster + +Team member lists live in `~/.node-assistant/team-roster-{core,dra}.json`. Format: + +```json +{ + "description": "Node Core team members", + "members": { + "Jira Display Name": "github-handle", + "Another Person": "their-github-handle" + } +} +``` + +Use these to resolve display names for assignment, filter team activity, and exclude external CVE assignees. + +Bot account treated as unassigned: `Node Team Bot Account`. + +## Sub-teams + +| Team | Sprint filter | Roster file | Bug components | +|------|--------------|-------------|----------------| +| Core | `Node Core` | `team-roster-core.json` | All Node components | +| DRA/Devices | `Node Devices` | `team-roster-dra.json` | Node / Device Manager, Node / Instaslice-operator | + +## Custom Field IDs + +Use field names in JQL, IDs in REST API calls: + +| ID | Name | Notes | +|----|------|-------| +| `customfield_10014` | Epic Link | String key, e.g. 
`"OCPNODE-1234"` | +| `customfield_10011` | Epic Name | | +| `customfield_10020` | Sprint | Array of objects with `state` field (`active`/`closed`/`future`) | +| `customfield_10028` | Story Points | Number | +| `customfield_10001` | Team | | +| `customfield_10855` | Target Version | | +| `customfield_10840` | Severity | Object: `{"value": "Critical"}` | +| `customfield_10847` | Release Blocker | Object: `{"value": "Approved"}` or `{"value": "Proposed"}` | +| `customfield_10517` | Blocked | Object: `{"value": "True"}` or `{"value": "False"}` | +| `customfield_10483` | Blocked Reason | ADF document | +| `customfield_10978` | SFDC Cases Counter | Number | +| `customfield_10979` | SFDC Cases Links | | + +## Saved Filters + +Use in JQL via `filter = "Name"`: + +| Name | ID | Scope | +|------|-----|-------| +| Node Components | 91645 | Component list | +| Node Bugs | 83963 | Node component bugs | +| Node Core Team | 66331 | Core team members | +| Node Green Team | 89708 | Green team assignees | +| Node Blue Team | 64253 | Blue team assignees | +| Node Epics | 96318 | OCPNODE epics | +| Node CR bugs | 94401 | Component regression bugs | + +## Workflow Statuses + +Bug lifecycle: NEW → To Do → ASSIGNED → POST → Modified → ON_QA → Verified → CLOSED/Done + +Feature/epic: New → Planning → To Do → In Progress → Code Review → Review → Dev Complete → Done/Closed + +Status grouping for dashboards: map `statusCategory` key `"done"` → done, status name `"Code Review"` → codeReview, `"MODIFIED"` → modified, `statusCategory` `"indeterminate"` → inProgress, `statusCategory` `"new"` → toDo, else → other. 
+ +## Key Field Meanings + +| Field Value | Meaning | +|-------------|---------| +| Priority: Undefined | Untriaged — needs prioritization | +| Release Blocker: Proposed | Someone thinks this blocks the release | +| Release Blocker: Approved | Confirmed release blocker | +| SFDC Cases Counter (not empty) | Has linked support cases | + +## Bug Triage Definitions + +Base all queries on `filter = "Node Bugs"` and append each clause as `AND (<clause>)` — keep the parentheses, because most clauses contain `OR` and would otherwise escape the base filter: + +| Category | JQL Clause | +|----------|-----------| +| Untriaged | `priority = Undefined OR "Release Blocker" = Proposed OR assignee in ("aos-node@redhat.com")` | +| Blocker? | `"Release Blocker" = Proposed OR priority = Blocker AND "Release Blocker" is EMPTY` | +| Blocker+ | `"Release Blocker" = Approved OR priority = Blocker` | +| Customer Issues | `"Customer Impact" = "Customer Escalated" OR "SFDC Cases Counter" is not EMPTY` | +| CVE | `labels in (SecurityTracking) OR issuetype in (Vulnerability, Weakness)` | +| CR | `labels = component-regression` | + +## Carryover Detection + +Count closed sprints in `customfield_10020` array to detect carryovers: +``` +sprints_carried = count of items in customfield_10020 where state == "closed" +``` + +## External CVE Filtering + +Exclude from bug counts: bugs with "CVE" in summary AND status "ASSIGNED" AND assignee not in team roster AND assignee != "Unassigned". These are handled by other teams. + +## Gotchas + +- Epic children: use `"Epic Link" = EPIC-KEY` in JQL (not `parentEpic`). +- `issueFunction` does **not exist** on Jira Cloud. Workaround: `watcher = currentUser() AND comment ~ "keyword"`. +- Always confirm with the user before any write operation (create, edit, comment, transition). +- Release Blocker and Blocked fields are objects (`{"value":"True"}`), not strings. Check shape before accessing `.value`. 
+- When listing sprints, filter to Node-relevant by checking if sprint name contains "Node" or "Kueue", then sort by `startDate` descending. 
diff --git a/plugins/node-team/skills/node/references/platform-docs.md b/plugins/node-team/skills/node/references/platform-docs.md new file mode 100644 index 000000000..c01e44a01 --- /dev/null +++ b/plugins/node-team/skills/node/references/platform-docs.md @@ -0,0 +1,23 @@ +# Platform Documentation Lookup + +Prefer retrieval over pre-training for Kubernetes and OpenShift specifics — docs change across versions. Use `gh api` to fetch raw markdown. + +## Kubernetes + +- Repo: `kubernetes/website`, path: `content/en/docs/` +- Versioning: git branches named `release-X.Y` — discover latest by listing branches, grep `^release-`, sort, take last +- No index file — navigate by listing directories +- Hugo shortcodes (`{{< ... >}}`) appear in content — ignore them +- Always include `?ref=$VERSION` in API calls + +## OpenShift + +- Repo: `harche/openshift-docs-md`, path: `docs/{version}/` +- Versions are directories (e.g. `4.22`) — discover latest by listing `docs/`, filter numeric names, take highest +- Each version has an **`AGENTS.md`** index mapping topics to doc files — always start here + +## Common + +- Always use `-H "Accept: application/vnd.github.raw+json"` for raw file content +- Discover versions dynamically — never hardcode +- Read-only diff --git a/plugins/node-team/skills/node/references/prometheus.md b/plugins/node-team/skills/node/references/prometheus.md new file mode 100644 index 000000000..f8bd940e4 --- /dev/null +++ b/plugins/node-team/skills/node/references/prometheus.md @@ -0,0 +1,35 @@ +# Prometheus on OpenShift/Kubernetes + +Query cluster metrics using `promtool`. Install: `brew install prometheus`. + +## Critical Rules + +These caused real failures — follow exactly. + +1. **Run setup + queries in a single bash call.** Shell variables (`$PROM_URL`, `$HTTP_CONFIG`, `$TOKEN`) don't persist across separate bash invocations. Combine with `&&`. + +2. **Never use `!=` in PromQL.** Zsh mangles `!=` into `\!=` via history expansion, even inside single quotes. 
Use `=~".+"` instead of `!=""`, and negated regex instead of `!=`. + +3. **JSON output is a raw array.** `promtool -o json` outputs `[{metric:{...}, value:[ts, val]}, ...]` — NOT `{data:{result:...}}`. Parse with `jq '.[]'`, not `jq '.data.result[]'`. + +4. **`oc whoami -t` may return empty AND exit non-zero.** Client-cert kubeconfigs have no session token. Always: `TOKEN=$(oc whoami -t 2>/dev/null || true)`, then check if empty and fall back to creating a service account token. + +5. **`promtool check healthy/ready` returns 503 on Thanos Querier.** Expected — Thanos doesn't expose `/-/healthy`. Test with `promtool query instant ... 'up'` instead. + +6. **Clean up temp files.** Always `rm -f "$HTTP_CONFIG"` and `kill $PF_PID 2>/dev/null` after queries. + +## OpenShift Setup Pattern + +All in one bash call: + +1. Get token: `oc whoami -t` or create SA `prometheus-reader` in `openshift-monitoring` with `cluster-monitoring-view` role, then `oc create token` +2. Get Thanos route: `oc -n openshift-monitoring get route thanos-querier -o jsonpath='{.status.ingress[].host}'` +3. Write HTTP config to temp file (Bearer token + `insecure_skip_verify: true`) +4. Run queries +5. Clean up temp file + +For vanilla Kubernetes: find the Prometheus service (`kubectl get svc -A | grep prometheus`), port-forward to 9090, no auth usually needed. + +## Cross-Platform Date + +macOS and Linux `date` differ. Use: `date -u -d '1 hour ago' +FMT 2>/dev/null || date -u -v-1H +FMT` diff --git a/plugins/node-team/skills/node/references/support.md b/plugins/node-team/skills/node/references/support.md new file mode 100644 index 000000000..95c9a0986 --- /dev/null +++ b/plugins/node-team/skills/node/references/support.md @@ -0,0 +1,34 @@ +# Red Hat Support: Knowledge Base & Cases + +## Authentication + +Both APIs use OAuth Bearer tokens. 
Get the offline token from keychain, exchange for access token: + +- Keychain key: `RH_API_OFFLINE_TOKEN` (macOS: `security find-generic-password -a "$USER" -s "RH_API_OFFLINE_TOKEN" -w`, Linux: `secret-tool lookup service redhat key RH_API_OFFLINE_TOKEN`) +- Token exchange: `POST https://sso.redhat.com/auth/realms/redhat-external/protocol/openid-connect/token` with `grant_type=refresh_token`, `client_id=rhsm-api`, `refresh_token=$OFFLINE_TOKEN` → extract `access_token` from response + +Always get a fresh token before each session. + +## Knowledge Base + +Endpoint: `GET https://access.redhat.com/hydra/rest/search/kcs` + +Key params: `q` (search terms, `+` joins), `rows`, `start` (pagination offset), `fq` (filter), `fl` (field list), `sort`. + +Useful `fq` filters: `documentKind:Solution`, `id:7087003` (fetch by ID), `boostProduct:openshift`. + +Solution-specific field names (Solr): `solution_resolution`, `solution_rootcause`, `solution_environment`, `solution_diagnosticsteps`, `issue`, `caseCount`. + +URL parsing: `https://access.redhat.com/solutions/7087003` → extract `7087003`, fetch with `fq=id:7087003`. + +## Support Cases + +Endpoint: `https://api.access.redhat.com/support/v1/cases/{caseNumber}` + +Comments: `GET .../comments`, Attachments: `GET .../attachments`, Search: `POST .../filter` with JSON body (`maxResults`, `offset`, `keyword`, `status`, `product`, `startDate`, `endDate`). + +Statuses: `Waiting on Red Hat`, `Waiting on Customer`, `Closed`. Severities: `1 (Urgent)` = production down, `2 (High)`, `3 (Normal)`, `4 (Low)`. + +URL parsing: `https://access.redhat.com/support/cases/#/case/04378910` → extract `04378910`. + +When Jira bugs have SFDC case links (`customfield_12313441` or `customfield_10979`), look up each referenced case number.