From a3a6d6bb7e9b8f9a8d632cdf5b45011ebe4a7136 Mon Sep 17 00:00:00 2001 From: Biagio Manzari Date: Mon, 30 Dec 2024 10:33:54 +0100 Subject: [PATCH] [ Staged Rollout ] - Enable OP in ABI static-3-plus-1 jobs --- .../baremetal-lab-agent-install-commands.sh | 16 + ...tall-network-static-3-plus-1-workflow.yaml | 3 + .../baremetal-lab-ipi-install-commands.sh | 6 + .../baremetal-lab-post-firewall-commands.sh | 2 +- .../baremetal-lab-pre-firewall-commands.sh | 3 +- .../baremetal-lab-upi-install-commands.sh | 6 + .../observers/ocp-baremetal-qe/OWNERS | 10 + .../observers-ocp-baremetal-qe-commands.sh | 578 ++++++++++++++++++ ...rs-ocp-baremetal-qe-observer.metadata.json | 17 + .../observers-ocp-baremetal-qe-observer.yaml | 15 + 10 files changed, 654 insertions(+), 2 deletions(-) create mode 100644 ci-operator/step-registry/observers/ocp-baremetal-qe/OWNERS create mode 100755 ci-operator/step-registry/observers/ocp-baremetal-qe/observers-ocp-baremetal-qe-commands.sh create mode 100644 ci-operator/step-registry/observers/ocp-baremetal-qe/observers-ocp-baremetal-qe-observer.metadata.json create mode 100644 ci-operator/step-registry/observers/ocp-baremetal-qe/observers-ocp-baremetal-qe-observer.yaml diff --git a/ci-operator/step-registry/baremetal/lab/agent/install/baremetal-lab-agent-install-commands.sh b/ci-operator/step-registry/baremetal/lab/agent/install/baremetal-lab-agent-install-commands.sh index 05e6607505011..eb9b44710e879 100644 --- a/ci-operator/step-registry/baremetal/lab/agent/install/baremetal-lab-agent-install-commands.sh +++ b/ci-operator/step-registry/baremetal/lab/agent/install/baremetal-lab-agent-install-commands.sh @@ -230,6 +230,13 @@ cp "${INSTALL_DIR}/auth/kubeconfig" "${SHARED_DIR}/" cp "${INSTALL_DIR}/auth/kubeadmin-password" "${SHARED_DIR}/" scp "${SSHOPTS[@]}" "${INSTALL_DIR}"/auth/* "root@${AUX_HOST}:/var/builds/${CLUSTER_NAME}/" +# Copy coreos stream file so the observer pod can check if the correct live image was booted +echo -e "\nGenerating coreOS stream file..." + +# Creating file straight into $SHARED_DIR is not 100% reliable because of propagation issues (author guessing) +oinst coreos print-stream-json > "${INSTALL_DIR}/coreos-stream.json" +scp "${SSHOPTS[@]}" "${INSTALL_DIR}/coreos-stream.json" "root@${AUX_HOST}:/var/builds/${CLUSTER_NAME}/coreos-stream.json" + proxy="$(<"${CLUSTER_PROFILE_DIR}/proxy")" # shellcheck disable=SC2154 for bmhost in $(yq e -o=j -I=0 '.[]' "${SHARED_DIR}/hosts.yaml"); do @@ -256,6 +263,9 @@ echo "Launching 'wait-for bootstrap-complete' installation step....." http_proxy="${proxy}" https_proxy="${proxy}" HTTP_PROXY="${proxy}" HTTPS_PROXY="${proxy}" \ oinst agent wait-for bootstrap-complete 2>&1 & if ! wait $!; then + # Used by observer pod + touch "${SHARED_DIR}/failure" + # TODO: gather logs?? echo "ERROR: Bootstrap failed. Aborting execution." exit 1 fi @@ -266,9 +276,15 @@ http_proxy="${proxy}" https_proxy="${proxy}" HTTP_PROXY="${proxy}" HTTPS_PROXY=" oinst agent wait-for install-complete & if ! wait "$!"; then echo "ERROR: Installation failed. Aborting execution." + # Used by observer pod + touch "${SHARED_DIR}/failure" + # TODO: gather logs?? exit 1 fi echo "Ensure that all the cluster operators remain stable and ready until OCPBUGS-18658 is fixed." oc adm wait-for-stable-cluster --minimum-stable-period=1m --timeout=15m update_image_registry + +# Used by observer pod +touch "${SHARED_DIR}/success" \ No newline at end of file diff --git a/ci-operator/step-registry/baremetal/lab/agent/install/network/static/3-plus-1/baremetal-lab-agent-install-network-static-3-plus-1-workflow.yaml b/ci-operator/step-registry/baremetal/lab/agent/install/network/static/3-plus-1/baremetal-lab-agent-install-network-static-3-plus-1-workflow.yaml index f533aa92b72f5..910c04a71021e 100644 --- a/ci-operator/step-registry/baremetal/lab/agent/install/network/static/3-plus-1/baremetal-lab-agent-install-network-static-3-plus-1-workflow.yaml +++ b/ci-operator/step-registry/baremetal/lab/agent/install/network/static/3-plus-1/baremetal-lab-agent-install-network-static-3-plus-1-workflow.yaml @@ -7,6 +7,9 @@ workflow: - chain: baremetal-lab-agent-install post: - chain: baremetal-lab-post + observers: + enable: + - observers-ocp-baremetal-qe env: ipv4_enabled: "true" ipv6_enabled: "false" diff --git a/ci-operator/step-registry/baremetal/lab/ipi/install/baremetal-lab-ipi-install-commands.sh b/ci-operator/step-registry/baremetal/lab/ipi/install/baremetal-lab-ipi-install-commands.sh index f8eab33000862..35ec546afc706 100644 --- a/ci-operator/step-registry/baremetal/lab/ipi/install/baremetal-lab-ipi-install-commands.sh +++ b/ci-operator/step-registry/baremetal/lab/ipi/install/baremetal-lab-ipi-install-commands.sh @@ -219,6 +219,10 @@ cp "${INSTALL_DIR}/auth/kubeconfig" "${SHARED_DIR}/" cp "${INSTALL_DIR}/auth/kubeadmin-password" "${SHARED_DIR}/" scp "${SSHOPTS[@]}" "${INSTALL_DIR}"/auth/* "root@${AUX_HOST}:/var/builds/${CLUSTER_NAME}/" +# Creating file straight into $SHARED_DIR is not 100% reliable because of propagation issues (author guessing) +oinst coreos print-stream-json > "${INSTALL_DIR}/coreos-stream.json" +cp "${INSTALL_DIR}/coreos-stream.json" "${SHARED_DIR}/" + date "+%F %X" > "${SHARED_DIR}/CLUSTER_INSTALL_START_TIME" # The installer's terraform template using the ironic provider needs to reach the ironic endpoint in the bootstrap VM @@ -237,6 +241,8 @@ echo -e "\n[INFO] Launching 'wait-for install-complete' installation step again. oinst wait-for install-complete & if ! wait "$!"; then echo "ERROR: Installation failed. Aborting execution." + # Used by observer pod + touch "${SHARED_DIR}/failure" exit 1 fi update_image_registry diff --git a/ci-operator/step-registry/baremetal/lab/post/firewall/baremetal-lab-post-firewall-commands.sh b/ci-operator/step-registry/baremetal/lab/post/firewall/baremetal-lab-post-firewall-commands.sh index 1941664e37ddd..563c7a4afc2ec 100644 --- a/ci-operator/step-registry/baremetal/lab/post/firewall/baremetal-lab-post-firewall-commands.sh +++ b/ci-operator/step-registry/baremetal/lab/post/firewall/baremetal-lab-post-firewall-commands.sh @@ -41,6 +41,6 @@ timeout -s 9 10m ssh "${SSHOPTS[@]}" "root@${AUX_HOST}" bash -s -- \ INTERNAL_NET_CIDR="${1}" IP_ARRAY="${@:2}" for ip in $IP_ARRAY; do - iptables -D FORWARD -s ${ip} ! -d "${INTERNAL_NET_CIDR}" -j DROP + iptables -D FORWARD -s ${ip} ! -d "${INTERNAL_NET_CIDR}" ! -p tcp --dport 22 -j DROP done EOF \ No newline at end of file diff --git a/ci-operator/step-registry/baremetal/lab/pre/firewall/baremetal-lab-pre-firewall-commands.sh b/ci-operator/step-registry/baremetal/lab/pre/firewall/baremetal-lab-pre-firewall-commands.sh index 8c3399f0df428..6213301f85687 100644 --- a/ci-operator/step-registry/baremetal/lab/pre/firewall/baremetal-lab-pre-firewall-commands.sh +++ b/ci-operator/step-registry/baremetal/lab/pre/firewall/baremetal-lab-pre-firewall-commands.sh @@ -56,7 +56,8 @@ timeout -s 9 10m ssh "${SSHOPTS[@]}" "root@${AUX_HOST}" bash -s -- \ IP_ARRAY="${@:2}" for ip in $IP_ARRAY; do # TODO: change to firewalld or nftables - iptables -A FORWARD -s ${ip} ! -d "${INTERNAL_NET_CIDR}" -j DROP + # Allow connections on port 22 used by observer pod + iptables -A FORWARD -s ${ip} ! -d "${INTERNAL_NET_CIDR}" ! -p tcp --dport 22 -j DROP done EOF diff --git a/ci-operator/step-registry/baremetal/lab/upi/install/baremetal-lab-upi-install-commands.sh b/ci-operator/step-registry/baremetal/lab/upi/install/baremetal-lab-upi-install-commands.sh index 74ab7c7685c21..2cedacd1bc574 100644 --- a/ci-operator/step-registry/baremetal/lab/upi/install/baremetal-lab-upi-install-commands.sh +++ b/ci-operator/step-registry/baremetal/lab/upi/install/baremetal-lab-upi-install-commands.sh @@ -314,6 +314,10 @@ cp "${INSTALL_DIR}/metadata.json" "${SHARED_DIR}/" cp "${INSTALL_DIR}/auth/kubeconfig" "${SHARED_DIR}/" cp "${INSTALL_DIR}/auth/kubeadmin-password" "${SHARED_DIR}/" +# Creating file straight into $SHARED_DIR is not 100% reliable because of propagation issues (author guessing) +oinst coreos print-stream-json > "${INSTALL_DIR}/coreos-stream.json" +cp "${INSTALL_DIR}/coreos-stream.json" "${SHARED_DIR}/" + echo -e "\nPower on the hosts..." # shellcheck disable=SC2154 for bmhost in $(yq e -o=j -I=0 '.[]' "${SHARED_DIR}/hosts.yaml"); do @@ -350,6 +354,8 @@ echo -e "\nLaunching 'wait-for install-complete' installation step....." oinst wait-for install-complete & if ! wait "$!"; then echo "ERROR: Installation failed. Aborting execution." + # Used by observer pod + touch "${SHARED_DIR}/failure" # TODO exit 1 fi diff --git a/ci-operator/step-registry/observers/ocp-baremetal-qe/OWNERS b/ci-operator/step-registry/observers/ocp-baremetal-qe/OWNERS new file mode 100644 index 0000000000000..c223fd95ea275 --- /dev/null +++ b/ci-operator/step-registry/observers/ocp-baremetal-qe/OWNERS @@ -0,0 +1,10 @@ +approvers: +- aleskandro +- pamoedom +- bmanzari +reviewers: +- aleskandro +- jadhaj +- bmanzari +- pamoedom +- sgoveas diff --git a/ci-operator/step-registry/observers/ocp-baremetal-qe/observers-ocp-baremetal-qe-commands.sh b/ci-operator/step-registry/observers/ocp-baremetal-qe/observers-ocp-baremetal-qe-commands.sh new file mode 100755 index 0000000000000..c16912195e233 --- /dev/null +++ b/ci-operator/step-registry/observers/ocp-baremetal-qe/observers-ocp-baremetal-qe-commands.sh @@ -0,0 +1,578 @@ +#!/bin/bash + +# Suppress shellcheck warning for 14+ instances of '. <(echo "$bmhost"' +# 'var is referenced but not assigned.' as a consequence of using '. <' +# shellcheck disable=SC1090 +# shellcheck disable=SC2154 + +set -o nounset + +SSHOPTS=(-o 'ConnectTimeout=5' + -o 'StrictHostKeyChecking=no' + -o 'UserKnownHostsFile=/dev/null' + -o 'ServerAliveInterval=90' + -o LogLevel=ERROR + -i "${CLUSTER_PROFILE_DIR}/ssh-key") + +AUX_HOST="openshift-qe-metal-ci.arm.eng.rdu2.redhat.com" + +# IPI/UPI - MCO takes way longer than ABI to come up, 10 minutes wait not enough +MAX_RETRY=15 +NODE_ALIVE_SLEEP=60 + + +# If I use $SHARED_DIR in this script, at runtime it resolves to /tmp/secret/ +# even though from pod terminal it has correct value of /var/run/secrets/ci.openshift.io/multi-stage/ + +HOSTS_FILE="/var/run/secrets/ci.openshift.io/multi-stage/hosts.yaml" + +COREOS_STREAM_FILE="/tmp/coreos-stream.json" + +INSTALL_SUCCESS_FILE="/var/run/secrets/ci.openshift.io/multi-stage/success" +INSTALL_FAILURE_FILE="/var/run/secrets/ci.openshift.io/multi-stage/failure" + + +# https://docs.ci.openshift.org/docs/internals/observer-pods/ + +NODE_STARTUP="Node startup" +NODE_BOOTED_IMAGE="Node booted image" +NODE_REBOOTED="Node rebooted" +NODE_BOOTED_DISK="Node booted disk" +NODE_INSTALLING="Node is installing" +NODE_REBOOTING="Node is rebooting" + +NODE_IS_REACHABLE="Node is reachable" +NODE_IS_UNREACHABLE="Node is unreachable" + +NODE_IS_UNRECOVERABLE="Node is unrecoverable" + +INSTALL_COMPLETE="Install completed" + +EXIT_CODE_UNREACHABLE=10 +EXIT_CODE_WRONG_VERSION=20 +EXIT_CODE_COREOS_NOT_FOUND=30 + +IS_PXE_JOB=false + +FSM_FILE_PREFIX="/tmp/fsm_" + +function writeFSMFile(){ + local host="${1}" + local message="${2}" + local fileToWrite=${FSM_FILE_PREFIX}${host} + flock -x -w 5 $fileToWrite echo $message > $fileToWrite +} + +function handleUnreachableNode(){ + local bmhost="${1}" + . <(echo "$bmhost" | yq e 'to_entries | .[] | (.key + "=\"" + .value + "\"")') + if [ "$(grep -P "(?=.*?$host)(?=.*?$EXIT_CODE_UNREACHABLE)" "${ARTIFACT_DIR}/node-status.txt")" != 0 ]; then + echo "Host has already been rebooted once, exiting" + writeFSMFile $host "${NODE_IS_UNRECOVERABLE}" + else + echo "Host ${ip} not alive, rebooting..." + boot_from="cdrom" + if [[ $IS_PXE_JOB = true ]]; then + boot_from="pxe" + fi + reset_node "${bmhost}" "${boot_from}" & + echo "${host} $EXIT_CODE_UNREACHABLE" >> "${ARTIFACT_DIR}/node-status.txt" + isNodeAlive "${bmhost}" & + fi +} + +function handleWrongVersionBooted(){ + local bmhost="${1}" + echo "Host has booted wrong version" + . <(echo "$bmhost" | yq e 'to_entries | .[] | (.key + "=\"" + .value + "\"")') + echo "$EXIT_CODE_WRONG_VERSION" "${host} ${name}" >> "${ARTIFACT_DIR}/install-status.txt" + kill -1 $$ +} + +function handleOSNotFound(){ + local bmhost="${1}" + echo "Base operating system not found" + . <(echo "$bmhost" | yq e 'to_entries | .[] | (.key + "=\"" + .value + "\"")') + echo "$EXIT_CODE_COREOS_NOT_FOUND" "${host} ${name}" >> "${ARTIFACT_DIR}/install-status.txt" + kill -1 $$ +} + + +function handleNode(){ + local bmhost="${1}" + local TRAP_EXIT_CODE="${2}" + echo "handling node after event $TRAP_EXIT_CODE" + case $TRAP_EXIT_CODE in + "$EXIT_CODE_UNREACHABLE") + handleUnreachableNode $bmhost + ;; + "$EXIT_CODE_WRONG_VERSION") + handleWrongVersionBooted $bmhost + ;; + "$EXIT_CODE_COREOS_NOT_FOUND") + handleOSNotFound $bmhost + ;; + *) + exit 0 + ;; + esac +} + +function reset_node(){ + local bmhost="${1}" + local boot_from="${2}" + . <(echo "$bmhost" | yq e 'to_entries | .[] | (.key + "=\"" + .value + "\"")') + echo "Setting boot device for host $host to : $boot_from" + ipmitool -I lanplus -H "${AUX_HOST}" -p "${bmc_forwarded_port}" -U "$bmc_user" -P "$bmc_pass" chassis bootdev "$boot_from" options=efiboot + echo "Rebooting $host..." + ipmitool -I lanplus -H "${AUX_HOST}" -p "${bmc_forwarded_port}" -U "$bmc_user" -P "$bmc_pass" power cycle +} + +# There might be multiple observer pods running, kill ONLY the processes spawned by THIS instance using unique identifiers such as $host or $bmc_address +function killPendingBastionProcesses(){ + echo "entering killPendingBastionProcesses" + for bmhost in $(yq e -o=j -I=0 '.[]' "${HOSTS_FILE}"); do + . <(echo "$bmhost" | yq e 'to_entries | .[] | (.key + "=\"" + .value + "\"")') + echo "vendor is: $vendor" + timeout -s 9 5m ssh -q "${SSHOPTS[@]}" -t "root@${AUX_HOST}" "pkill -f '$bmc_address'" || true; + if [[ $vendor == *"hpe"* ]]; then + timeout -s 9 5m ssh -q "${SSHOPTS[@]}" -t "root@${AUX_HOST}" "pkill -f '$host'" || true; + fi + done + # kill connections + pkill -f ${AUX_HOST} + echo "bastion processes killed" +} + + +# Create per-node report +function createInstallJunit() { + echo "Creating JUnit report" + if test -f "${ARTIFACT_DIR}/install-status.txt" + then + + input="${ARTIFACT_DIR}/install-status.txt" + while IFS= read -r line + do + + EXIT_CODE=$(echo "$line" | awk '{print $1}') + HOST=$(echo "$line" | awk '{print $2}') + HOSTNAME=$(echo "$line" | awk '{print $3}') + + cat >>"${ARTIFACT_DIR}/junit_install_${HOST}.xml" < + + + + +EOF + + if [ "$EXIT_CODE" == "$EXIT_CODE_UNREACHABLE" ] + then + cat >>"${ARTIFACT_DIR}/junit_install_${HOST}.xml" < + + Host #${HOST} ($HOSTNAME) should be reachable and respond on the SSH port + + +EOF + elif [ "$EXIT_CODE" == "$EXIT_CODE_WRONG_VERSION" ] + then + cat >>"${ARTIFACT_DIR}/junit_install_${HOST}.xml" < + + Host #${HOST} ($HOSTNAME) should be reachable and respond on the SSH port + + + Host #${HOST} ($HOSTNAME) should boot the expected live image + + + openshift cluster install failed overall + + +EOF + elif [ "$EXIT_CODE" == "$EXIT_CODE_COREOS_NOT_FOUND" ] + then + cat >>"${ARTIFACT_DIR}/junit_install_${HOST}.xml" < + + Host #${HOST} ($HOSTNAME) should be reachable and respond on the SSH port + + + Host #${HOST} ($HOSTNAME) should boot the expected live image + + + Host #${HOST} ($HOSTNAME) should boot the installed OS from Disk + + + openshift cluster install failed overall + + +EOF + else + cat >>"${ARTIFACT_DIR}/junit_install_${HOST}.xml" < + + Host #${HOST} ($HOSTNAME) should be reachable and respond on the SSH port + + + Host #${HOST} ($HOSTNAME) should boot the expected live image + + + Host #${HOST} ($HOSTNAME) should boot the installed OS from Disk + + + openshift cluster install succeeded + + +EOF + fi + cat >>"${ARTIFACT_DIR}/junit_install_${HOST}.xml" < +EOF + done < "$input" + echo "JUnit reports created, exiting" + fi +} + +function isNodeReachable(){ + # Check if node is reachable poking SSH port using netcat + local host="${1}" + ssh_port=$((12000 + $host)) + status=$(timeout 5s nc ${AUX_HOST} "${ssh_port}" || true;) + echo "isNodeReachable: $status" + if [[ $status == *"SSH"* ]]; then + echo $NODE_IS_REACHABLE + else + echo $NODE_IS_UNREACHABLE + fi +} + + +function isNodeAlive(){ + local bmhost="${1}" + . <(echo "$bmhost" | yq e 'to_entries | .[] | (.key + "=\"" + .value + "\"")') + echo "Starting isNodeAlive for ${host}" + for i in $(seq 1 $MAX_RETRY); do + printf "%s: Checking SSH connectivity for %s %s/${MAX_RETRY}\n" "$(date --utc --iso=s)" "${ip}" "${i}" + ssh_port=$((12000 + $host)) + status="$(timeout 5s nc ${AUX_HOST} "${ssh_port}" || true;)" + echo "isNodeReachable: $status" + if [[ $status == *"SSH"* ]]; then + writeFSMFile $host "${NODE_IS_REACHABLE}" + break + else + if [[ $i == $(($MAX_RETRY)) ]]; then + writeFSMFile $host "${EXIT_CODE_UNREACHABLE}" + else + echo "Node ${host} is not up yet or something is wrong, retrying" + sleep $NODE_ALIVE_SLEEP + fi + fi + done + + echo "Ending isNodeAlive for ${host}" +} + +function handleReboot(){ + # handle reboot only if happens during install (skip post-wipe) + if [ ! -f "${INSTALL_SUCCESS_FILE}" ]; then + local bmhost="${1}" + . <(echo "$bmhost" | yq e 'to_entries | .[] | (.key + "=\"" + .value + "\"")') + writeFSMFile $host "${NODE_REBOOTING}" + echo "host $host rebooted, waiting 30s for services shutdown..." + # Wait for sshd to shutdown completely, avoid immediate check when node has yet to reboot + sleep 30 + ssh_port=$((12000 + $host)) + status=$(timeout 5s nc ${AUX_HOST} "${ssh_port}" || true;) + until [[ $status == *"SSH"* ]]; do + echo "$host rebooting, please wait..." + sleep 30 + status=$(timeout 5s nc ${AUX_HOST} "${ssh_port}" || true;) + done + writeFSMFile $host "${NODE_REBOOTED}" + fi +} + +function journalRecord(){ + local bmhost="${1}" + . <(echo "$bmhost" | yq e 'to_entries | .[] | (.key + "=\"" + .value + "\"")') + ssh_port=$((12000 + $host)) + echo "journalctl host $host" + writeFSMFile $host "${NODE_INSTALLING}" + ssh "${SSHOPTS[@]}" -t -p "${ssh_port}" "core@${AUX_HOST}" << EOF > "${ARTIFACT_DIR}/${name}_${ip}_journalctl.txt" + journalctl -f | grep -E 'level=info|level=warning|level=error|level=fatal' & +EOF + # We can assume the host rebooted if the ssh connection gets closed by remote host + # Connection to openshift-qe-metal-ci.arm.eng.rdu2.redhat.com closed by remote host + trap 'handleReboot ${bmhost} &' EXIT +} + +function recordJournalctl(){ + echo "recordJournalctl" + # shellcheck disable=SC2154 + for bmhost in $(yq e -o=j -I=0 '.[]' "${HOSTS_FILE}"); do + journalRecord $bmhost & + done +} + + +function checkBootedImage(){ + local whatToCheck="${1:-boot}" + local bmhost="${2}" + # shellcheck disable=SC2154 + . <(echo "$bmhost" | yq e 'to_entries | .[] | (.key + "=\"" + .value + "\"")') + ssh_port=$((12000 + $host)) + cmdline=$(timeout -s 9 5m ssh -q "${SSHOPTS[@]}" -t -p "${ssh_port}" "core@${AUX_HOST}" "cat /proc/cmdline" || true;) + echo $cmdline >> "${ARTIFACT_DIR}/cmdline_${host}.txt" + echo "checking $whatToCheck for host ${host}" + if [[ $whatToCheck == "boot" ]]; then + if [[ $IS_PXE_JOB = true ]]; then + # PXE BOOT + # BOOT_IMAGE=/ci-op-07q5cy9i/vmlinuz_x86_64 debug nosplash ip=eno8303:dhcp ip=eno12399np0:off ip=eno12409np1:off console=tty1 console=ttyS0,115200n8 + if [[ $cmdline == *"$NAMESPACE"* ]]; then + echo -e "Booted PXE image version \n $cmdline \n matches Prow namespace $NAMESPACE" + writeFSMFile $host "${NODE_BOOTED_IMAGE}" + else + echo -e "Booted PXE image version \n $cmdline \n DOES NOT match Prow namespace $NAMESPACE" + writeFSMFile $host "${EXIT_CODE_WRONG_VERSION}" + fi + else + # ISO BOOT + # BOOT_IMAGE=/images/pxeboot/vmlinuz coreos.liveiso=rhcos-415.92.202311241643-0 ignition.firstboot ignition.platform.id=metal + expected_x86_version=$(yq .architectures.x86_64.artifacts.metal.release $COREOS_STREAM_FILE) + echo "Expected x86_64 version to match booted image: $expected_x86_version" + expected_arm64_version=$(yq .architectures.aarch64.artifacts.metal.release $COREOS_STREAM_FILE) + echo "Expected ARM64 version to match booted image: $expected_arm64_version" + if [[ $cmdline == *"$expected_x86_version"* ]] || [[ $cmdline == *"$expected_arm64_version"* ]] ; then + echo -e "Booted ISO image version \n $cmdline \n matches expected version \n x86 $expected_x86_version arm64 $expected_arm64_version" + writeFSMFile $host "${NODE_BOOTED_IMAGE}" + else + echo -e "Booted ISO image version $cmdline" + echo -e "DOES NOT match expected versions : x86 $expected_x86_version arm64 $expected_arm64_version" + writeFSMFile $host "${EXIT_CODE_WRONG_VERSION}" + fi + fi + elif [[ $whatToCheck == "disk" ]]; then + echo "$cmdline \n" + # BOOT_IMAGE=(hd0,gpt3)/ostree/rhcos-8979e + if [[ $cmdline == *"ostree/rhcos"* ]]; then + echo -e "Red Hat CoreOS FOUND on disk" + writeFSMFile $host "${NODE_BOOTED_DISK}" + else + echo -e "Red Hat CoreOS NOT FOUND on disk" + writeFSMFile $host "${EXIT_CODE_COREOS_NOT_FOUND}" + fi + fi +} + +function checkNodes(){ + # shellcheck disable=SC2154 + for bmhost in $(yq e -o=j -I=0 '.[]' "${HOSTS_FILE}"); do + isNodeAlive "${bmhost}" & + done +} + +function isPxeJob(){ + if [[ $JOB_NAME == *"-pxe-"* ]]; then + IS_PXE_JOB=true + fi + echo "Job name is $JOB_NAME , pxe? $IS_PXE_JOB" +} + +function ipmiRecord(){ + local bmhost="${1}" + . <(echo "$bmhost" | yq e 'to_entries | .[] | (.key + "=\"" + .value + "\"")') + echo "$vendor SoL recording on ${bmc_address}" + case $vendor in + "dell") + ssh "${SSHOPTS[@]}" -tt -q "root@${AUX_HOST}" "ipmitool -I lanplus -H $bmc_address -U $bmc_user -P $bmc_pass -z 8196 sol activate usesolkeepalive" >> "${ARTIFACT_DIR}/${name}_${ip}_ipmi.txt" & + ;; + "hpe") + ssh "${SSHOPTS[@]}" -tt -q "root@${AUX_HOST}" "hpecmd $host vsp" >> "${ARTIFACT_DIR}/${name}_${ip}_ipmi.txt" & + ;; + esac +} + + +function recordIPMILog(){ + echo "recordIPMILog" + # shellcheck disable=SC2154 + for bmhost in $(yq e -o=j -I=0 '.[]' "${HOSTS_FILE}"); do + ipmiRecord $bmhost + done +} + + +function waitFor(){ + local fileToWait="${1}" + while [ ! -f "${fileToWait}" ]; do + printf "%s: waiting for %s\n" "$(date --utc --iso=s)" "${fileToWait}" + sleep 30 + done + printf "%s: acquired %s\n" "$(date --utc --iso=s)" "${fileToWait}" +} + + + +# Create a machine state tracking file for every host +# Use an associative array to bind host data to a file +function initFSM(){ + for bmhost in $(yq e -o=j -I=0 '.[]' "${HOSTS_FILE}"); do + . <(echo "$bmhost" | yq e 'to_entries | .[] | (.key + "=\"" + .value + "\"")') + FSM_FILE=${FSM_FILE_PREFIX}${host} + touch $FSM_FILE + echo $NODE_STARTUP > $FSM_FILE + done +} + +INSTALL_SUCCESS=false + +function postInstall(){ + for bmhost in $(yq e -o=j -I=0 '.[]' "${HOSTS_FILE}"); do + . <(echo "$bmhost" | yq e 'to_entries | .[] | (.key + "=\"" + .value + "\"")') + writeFSMFile $host "${INSTALL_COMPLETE}" + # Use code '0' when everything is working as expected, green junit report, append line only once + if [ "$INSTALL_SUCCESS" = "true" ]; then + echo 0 "${host} ${name}" >> "${ARTIFACT_DIR}/install-status.txt" + else + echo 1 "${host} ${name}" >> "${ARTIFACT_DIR}/install-status.txt" + fi + done + # Let the monitorFSM loop process the INSTALL_COMPLETE status + sleep 60 + echo "Killing processes running on bastion host before exit" + killPendingBastionProcesses +} + +function waitForInstallSuccess(){ + while [ ! -f "${INSTALL_SUCCESS_FILE}" ]; do + printf "%s: waiting for %s\n" "$(date --utc --iso=s)" "${INSTALL_SUCCESS_FILE}" + sleep 30 + done + printf "%s: acquired %s\n" "$(date --utc --iso=s)" "${INSTALL_SUCCESS_FILE}" + INSTALL_SUCCESS=true + postInstall +} + +function waitForInstallFailure(){ + while [ ! -f "${INSTALL_FAILURE_FILE}" ] && [ "$INSTALL_SUCCESS" == "false" ]; do + printf "%s: waiting for %s\n" "$(date --utc --iso=s)" "${INSTALL_FAILURE_FILE}" + sleep 30 + done + if [[ -f "${INSTALL_FAILURE_FILE}" ]]; then + printf "%s: acquired %s\n" "$(date --utc --iso=s)" "${INSTALL_FAILURE_FILE}" + # INSTALL_SUCCESS default value is false + postInstall + fi +} + +function waitForInstall(){ + waitForInstallSuccess & + waitForInstallFailure & +} + +function monitorFSM(){ + INSTALL_COMPLETED=false + while [ "$INSTALL_COMPLETED" = "false" ] + do + for bmhost in $(yq e -o=j -I=0 '.[]' "${HOSTS_FILE}"); do + . <(echo "$bmhost" | yq e 'to_entries | .[] | (.key + "=\"" + .value + "\"")') + FSM_FILE=${FSM_FILE_PREFIX}${host} + # Using file locks to prevent race conditions + { + flock -s 3 # wait for a read lock on FSM_FILE + status=$(cat <&3) # read contents of the FSM_FILE file from FD 3 + } 3<$FSM_FILE # all of this with FSM_FILE open to FD 3 + echo "filename is $FSM_FILE with status: $status" + case $status in + # state defined by function: isNodeAlive + "$NODE_IS_REACHABLE") + echo "Node ${host} alive, waiting for services to come up..." + # connections may not work even if SSH check passed + sleep 60 + # Once node is reachable, check if the correct image was booted + checkBootedImage "boot" "${bmhost}" + ;; + # state defined by function: checkBootedImage boot + "$NODE_BOOTED_IMAGE") + # If the correct image was booted, start recording journalctl logs and leverage SSH connection trap to detect reboot + journalRecord "${bmhost}" & + ;; + # state defined by function: journalRecord + "$NODE_INSTALLING") + echo "Node $host is installing" + ;; + # state defined by function: handleReboot + "$NODE_REBOOTING") + echo "Node $host is rebooting" + ;; + # state defined by function: handleReboot + "$NODE_REBOOTED") + # When the journalctl trap detects the first reboot, check if host booted correctly from disk + echo "node $host up again, checking booted image" + checkBootedImage "disk" "${bmhost}" + ;; + # state defined by function: checkBootedImage disk + "$NODE_BOOTED_DISK") + journalRecord "${bmhost}" & + ;; + # state defined by function: waitForInstall + "$INSTALL_COMPLETE") + echo "Node $host completed the install, exiting" + createInstallJunit + INSTALL_COMPLETED=true + break + ;; + # state defined by function: isNodeAlive + "$EXIT_CODE_UNREACHABLE") + handleNode "${bmhost}" "${EXIT_CODE_UNREACHABLE}" + ;; + # state defined by function: checkBootedImage boot + "$EXIT_CODE_WRONG_VERSION") + handleNode "${bmhost}" "${EXIT_CODE_WRONG_VERSION}" + ;; + # state defined by function: checkBootedImage disk + "$EXIT_CODE_COREOS_NOT_FOUND") + handleNode "${bmhost}" "${EXIT_CODE_COREOS_NOT_FOUND}" + ;; + # state defined by function: initFSM + "$NODE_STARTUP") + echo "Node $host is starting up" + ;; + esac + sleep 10 + done + done +} + +function retrieveCoreOSVersionFile(){ + CLUSTER_NAME=$(<"/var/run/secrets/ci.openshift.io/multi-stage/cluster_name") + scp -r "${SSHOPTS[@]}" "root@${AUX_HOST}:/var/builds/${CLUSTER_NAME}/coreos-stream.json" "/tmp/" +} + +function initObserverPod(){ + waitFor $HOSTS_FILE + waitFor $KUBECONFIG + retrieveCoreOSVersionFile + waitFor $COREOS_STREAM_FILE + isPxeJob + recordIPMILog + initFSM + checkNodes + monitorFSM & + waitForInstall & +} + + + +initObserverPod + +# Execution flow + +# Check nodes reachability through SSH +# If nodes are reachable, check if they booted from the live image +# If they booted from the live image, check the image version is correct +# Wait for reboot and check if nodes booted from disk +# Wait for reboot after rebase and check if nodes booted from disk +# On observer exit, create per-node junit reports with failures diff --git a/ci-operator/step-registry/observers/ocp-baremetal-qe/observers-ocp-baremetal-qe-observer.metadata.json b/ci-operator/step-registry/observers/ocp-baremetal-qe/observers-ocp-baremetal-qe-observer.metadata.json new file mode 100644 index 0000000000000..ad8a4aa665989 --- /dev/null +++ b/ci-operator/step-registry/observers/ocp-baremetal-qe/observers-ocp-baremetal-qe-observer.metadata.json @@ -0,0 +1,17 @@ +{ + "path": "observers/ocp-baremetal-qe/observers-ocp-baremetal-qe-observer.yaml", + "owners": { + "approvers": [ + "aleskandro", + "pamoedom", + "bmanzari" + ], + "reviewers": [ + "aleskandro", + "jadhaj", + "bmanzari", + "pamoedom", + "sgoveas" + ] + } +} \ No newline at end of file diff --git a/ci-operator/step-registry/observers/ocp-baremetal-qe/observers-ocp-baremetal-qe-observer.yaml b/ci-operator/step-registry/observers/ocp-baremetal-qe/observers-ocp-baremetal-qe-observer.yaml new file mode 100644 index 0000000000000..6996c142f2fda --- /dev/null +++ b/ci-operator/step-registry/observers/ocp-baremetal-qe/observers-ocp-baremetal-qe-observer.yaml @@ -0,0 +1,15 @@ +observer: + name: observers-ocp-baremetal-qe + from_image: + namespace: ci + name: "baremetal-qe-base" + tag: latest + commands: observers-ocp-baremetal-qe-commands.sh + resources: + requests: + cpu: 500m + memory: 500Mi + timeout: 24h + grace_period: 1h + documentation: |- + Observe initial boot events of the hosts with common patterns for UPI/IPI/Agent, generate JUnits and self-heal