From a29650a94fa6bd8282426ba9841bbef71f20e22f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 20 May 2026 17:14:30 +0000 Subject: [PATCH 01/14] Initial plan From c8074f452759e515ea61576f5803c7d407b0a739 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 20 May 2026 17:21:10 +0000 Subject: [PATCH 02/14] Add agent e2e node config matrix Agent-Logs-Url: https://github.com/Azure/unbounded/sessions/95afa82b-ea29-442e-810d-674a5d2980d3 Co-authored-by: bcho <1975118+bcho@users.noreply.github.com> --- .github/workflows/agent-e2e-kind.yaml | 20 ++++ hack/agent/e2e-kind/e2e.py | 128 +++++++++++++++++++++++++- hack/agent/e2e-kind/run-local.sh | 8 ++ 3 files changed, 153 insertions(+), 3 deletions(-) diff --git a/.github/workflows/agent-e2e-kind.yaml b/.github/workflows/agent-e2e-kind.yaml index b56e4d29..4d11eb6d 100644 --- a/.github/workflows/agent-e2e-kind.yaml +++ b/.github/workflows/agent-e2e-kind.yaml @@ -48,14 +48,28 @@ permissions: jobs: agent-e2e: + name: agent e2e (${{ matrix.node_config.name }}) runs-on: ubuntu-24.04 timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + node_config: + - name: default + node_labels: "" + register_with_taints: "" + - name: labels-and-taints + node_labels: e2e.unbounded-cloud.io/config=labels-and-taints + register_with_taints: e2e.unbounded-cloud.io/dedicated=agent:NoSchedule env: KIND_CLUSTER_NAME: kind VM_NAME: agent-e2e VM_SUBNET: "192.168.100" VM_IP: "192.168.100.10" AGENT_MACHINE_NAME: agent-e2e + AGENT_E2E_CONFIG_NAME: ${{ matrix.node_config.name }} + AGENT_E2E_NODE_LABELS: ${{ matrix.node_config.node_labels }} + AGENT_E2E_REGISTER_WITH_TAINTS: ${{ matrix.node_config.register_with_taints }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -172,6 +186,9 @@ jobs: - name: Wait for node to become Ready run: python3 ./hack/agent/e2e-kind/e2e.py --verbose wait-for-node + - name: Validate node config + run: python3 ./hack/agent/e2e-kind/e2e.py --verbose validate-node-config + - name: Dump persisted agent config run: python3 ./hack/agent/e2e-kind/e2e.py --verbose dump-persisted-agent-config @@ -209,6 +226,9 @@ jobs: - name: Wait for node to become Ready (rejoin) run: python3 ./hack/agent/e2e-kind/e2e.py --verbose wait-for-node + - name: Validate node config (rejoin) + run: python3 ./hack/agent/e2e-kind/e2e.py --verbose validate-node-config + - name: Dump persisted agent config (rejoin) run: python3 ./hack/agent/e2e-kind/e2e.py --verbose dump-persisted-agent-config diff --git a/hack/agent/e2e-kind/e2e.py b/hack/agent/e2e-kind/e2e.py index 4792e36c..d2037383 100755 --- a/hack/agent/e2e-kind/e2e.py +++ b/hack/agent/e2e-kind/e2e.py @@ -19,11 +19,17 @@ Options: --verbose Enable diagnostic output (network diags). +Environment: + AGENT_E2E_CONFIG_NAME Name for the active node config variant. + AGENT_E2E_NODE_LABELS Comma-separated kubelet node labels. + AGENT_E2E_REGISTER_WITH_TAINTS Comma-separated kubelet registration taints. + Subcommands (called as individual workflow steps): create-vm Create bridge networking and launch a QEMU VM. ensure-kind-bridge Verify/repair veth pair connecting Kind to VM bridge. run-agent Build agent, generate bootstrap script, run on VM. wait-for-node Wait for the node to appear and become Ready. + validate-node-config Verify configured labels and taints reached the Node. dump-persisted-agent-config Print persisted agent config files from the VM. validate-workload Deploy test pods on the agent node. validate-kube-proxy Verify kube-proxy is Running on all nodes. @@ -75,6 +81,9 @@ KIND_CONTAINER = f"{KIND_CLUSTER_NAME}-control-plane" AGENT_MACHINE_NAME = os.environ.get("AGENT_MACHINE_NAME", "agent-e2e") AGENT_DEBUG = os.environ.get("AGENT_DEBUG", "") +AGENT_E2E_CONFIG_NAME = os.environ.get("AGENT_E2E_CONFIG_NAME", "default") +AGENT_E2E_NODE_LABELS = os.environ.get("AGENT_E2E_NODE_LABELS", "") +AGENT_E2E_REGISTER_WITH_TAINTS = os.environ.get("AGENT_E2E_REGISTER_WITH_TAINTS", "") # Site name used when generating the bootstrap script via kubectl-unbounded. E2E_SITE_NAME = "e2e" @@ -196,6 +205,60 @@ def _b64(val: str) -> str: return base64.b64encode(val.encode()).decode() +def _csv_env_values(raw: str) -> list[str]: + """Split a comma-separated environment value into non-empty entries.""" + return [item.strip() for item in raw.split(",") if item.strip()] + + +def expected_node_labels() -> dict[str, str]: + """Return labels configured for this e2e node variant.""" + labels: dict[str, str] = {} + for item in _csv_env_values(AGENT_E2E_NODE_LABELS): + if "=" not in item: + die(f"invalid AGENT_E2E_NODE_LABELS entry {item!r}, expected key=value") + key, value = item.split("=", 1) + if not key: + die(f"invalid AGENT_E2E_NODE_LABELS entry {item!r}, label key is empty") + labels[key] = value + return labels + + +def expected_node_taints() -> list[dict[str, str]]: + """Return taints configured for this e2e node variant.""" + taints: list[dict[str, str]] = [] + for item in _csv_env_values(AGENT_E2E_REGISTER_WITH_TAINTS): + if ":" not in item: + die(f"invalid AGENT_E2E_REGISTER_WITH_TAINTS entry {item!r}, expected key[=value]:Effect") + body, effect = item.rsplit(":", 1) + if "=" in body: + key, value = body.split("=", 1) + else: + key, value = body, "" + if not key or not effect: + die(f"invalid AGENT_E2E_REGISTER_WITH_TAINTS entry {item!r}, key and effect are required") + taints.append({"key": key, "value": value, "effect": effect}) + return taints + + +def node_config_bootstrap_args() -> list[str]: + """Return manual-bootstrap flags for the active node config variant.""" + args: list[str] = [] + for label in _csv_env_values(AGENT_E2E_NODE_LABELS): + args.extend(["--node-label", label]) + for taint in _csv_env_values(AGENT_E2E_REGISTER_WITH_TAINTS): + args.extend(["--register-with-taint", taint]) + return args + + +def log_active_node_config() -> None: + """Log the active e2e node config variant.""" + labels = _csv_env_values(AGENT_E2E_NODE_LABELS) + taints = _csv_env_values(AGENT_E2E_REGISTER_WITH_TAINTS) + log(f"Agent e2e node config variant: {AGENT_E2E_CONFIG_NAME}") + log(f" node labels: {', '.join(labels) if labels else ''}") + log(f" register-with-taints: {', '.join(taints) if taints else ''}") + + def _machine_operation_resource() -> str: """Return the fully-qualified MachineOperation resource name.""" return "machineoperations.v1alpha3.unbounded-cloud.io" @@ -1041,6 +1104,7 @@ def _run_agent_inner(agent_url: str) -> None: # version, and cluster DNS from the active kubeconfig. The bootstrap # token is resolved via the site label on the secret. log("Generating bootstrap script with kubectl-unbounded machine manual-bootstrap...") + log_active_node_config() # Capture the local API server URL from the kubeconfig (typically # https://127.0.0.1: for Kind) so we can replace it with the @@ -1054,11 +1118,13 @@ def _run_agent_inner(agent_url: str) -> None: if not local_api_server: die("Could not determine local API server URL from kubeconfig") - bootstrap_script = capture([ + bootstrap_args = [ KUBECTL_UNBOUNDED, "machine", "manual-bootstrap", AGENT_MACHINE_NAME, "--site", E2E_SITE_NAME, - ]) + *node_config_bootstrap_args(), + ] + bootstrap_script = capture(bootstrap_args) # The kubeconfig uses a localhost address that is not reachable from the VM. # Patch the generated script to use the Kind container IP instead. @@ -1139,6 +1205,43 @@ def wait_for_node() -> None: kubectl(["get", "nodes", "-o", "wide"]) +# --------------------------------------------------------------------------- +# validate-node-config +# --------------------------------------------------------------------------- +def _assert_expected_node_config(node: dict[str, Any]) -> None: + expected_labels = expected_node_labels() + expected_taints = expected_node_taints() + + labels = node.get("metadata", {}).get("labels", {}) + for key, value in expected_labels.items(): + actual = labels.get(key) + if actual != value: + die(f"node label mismatch for {key!r}: got {actual!r}, expected {value!r}") + + taints = node.get("spec", {}).get("taints", []) + for expected in expected_taints: + if not any( + taint.get("key") == expected["key"] + and taint.get("value", "") == expected["value"] + and taint.get("effect") == expected["effect"] + for taint in taints + ): + die(f"expected node taint not found: {expected}; node taints: {taints}") + + +def validate_node_config() -> None: + """Verify configured node labels and taints are present on the Node.""" + + log_active_node_config() + node = json.loads(kubectl_capture(["get", "node", AGENT_MACHINE_NAME, "-o", "json"])) + _assert_expected_node_config(node) + + log("============================================") + log(" Node config validation PASSED") + log("============================================") + kubectl(["get", "node", AGENT_MACHINE_NAME, "-o", "wide"]) + + # --------------------------------------------------------------------------- # dump-persisted-agent-config # --------------------------------------------------------------------------- @@ -1839,6 +1942,19 @@ def validate_machine_cr_created() -> None: log(f"bootstrapTokenRef is correct: {token_ref}") + expected_labels = expected_node_labels() + actual_labels = k8s_spec.get("nodeLabels") or {} + for key, value in expected_labels.items(): + actual = actual_labels.get(key) + if actual != value: + die(f"Machine CR nodeLabels mismatch for {key!r}: got {actual!r}, expected {value!r}") + + expected_taints = _csv_env_values(AGENT_E2E_REGISTER_WITH_TAINTS) + actual_taints = k8s_spec.get("registerWithTaints") or [] + for taint in expected_taints: + if taint not in actual_taints: + die(f"Machine CR registerWithTaints missing {taint!r}: {actual_taints}") + log("============================================") log(" Machine CR validation PASSED (created)") log("============================================") @@ -2002,7 +2118,10 @@ def validate_node_repave_upgrade() -> None: "template", {}, ).setdefault("kubernetes", {}) kubernetes_template["version"] = target_kubelet_version - kubernetes_template["nodeLabels"] = {"e2e.unbounded-cloud.io/config-version": "v3"} + kubernetes_template["nodeLabels"] = { + **expected_node_labels(), + "e2e.unbounded-cloud.io/config-version": "v3", + } kubectl(["apply", "-f", "-"], input=json.dumps(manifest).encode()) timeout_secs = 120 @@ -2049,6 +2168,8 @@ def validate_node_repave_upgrade() -> None: wait_for_node_absent(AGENT_MACHINE_NAME) wait_for_node() wait_for_node_kubelet_version(AGENT_MACHINE_NAME, target_kubelet_version) + node = json.loads(kubectl_capture(["get", "node", AGENT_MACHINE_NAME, "-o", "json"])) + _assert_expected_node_config(node) machine = json.loads(kubectl_capture(["get", "machine", AGENT_MACHINE_NAME, "-o", "json"])) status_config = machine.get("status", {}).get("configuration", {}) @@ -2145,6 +2266,7 @@ def cleanup() -> None: "dump-persisted-agent-config": dump_persisted_agent_config, "run-agent": run_agent, "wait-for-node": wait_for_node, + "validate-node-config": validate_node_config, "validate-kube-proxy": validate_kube_proxy, "validate-workload": validate_workload, "install-machine-crd": install_machine_crd, diff --git a/hack/agent/e2e-kind/run-local.sh b/hack/agent/e2e-kind/run-local.sh index 1ebe185b..7514f97c 100755 --- a/hack/agent/e2e-kind/run-local.sh +++ b/hack/agent/e2e-kind/run-local.sh @@ -26,6 +26,12 @@ # Usage: # ./hack/agent/e2e-kind/run-local.sh # ./hack/agent/e2e-kind/run-local.sh --verbose # enable diagnostic output +# +# Optional node config variant environment: +# AGENT_E2E_CONFIG_NAME=labels-and-taints \ +# AGENT_E2E_NODE_LABELS=e2e.unbounded-cloud.io/config=labels-and-taints \ +# AGENT_E2E_REGISTER_WITH_TAINTS=e2e.unbounded-cloud.io/dedicated=agent:NoSchedule \ +# ./hack/agent/e2e-kind/run-local.sh set -euo pipefail @@ -229,6 +235,7 @@ echo "" python3 "$E2E" $E2E_VERBOSE run-agent python3 "$E2E" $E2E_VERBOSE wait-for-node +python3 "$E2E" $E2E_VERBOSE validate-node-config python3 "$E2E" $E2E_VERBOSE dump-persisted-agent-config python3 "$E2E" $E2E_VERBOSE validate-kube-proxy python3 "$E2E" $E2E_VERBOSE validate-machine-cr-created @@ -249,6 +256,7 @@ python3 "$E2E" $E2E_VERBOSE delete-machine-cr python3 "$E2E" $E2E_VERBOSE ensure-kind-bridge python3 "$E2E" $E2E_VERBOSE run-agent python3 "$E2E" $E2E_VERBOSE wait-for-node +python3 "$E2E" $E2E_VERBOSE validate-node-config python3 "$E2E" $E2E_VERBOSE dump-persisted-agent-config python3 "$E2E" $E2E_VERBOSE validate-kube-proxy python3 "$E2E" $E2E_VERBOSE validate-machine-cr-created From 7e108adeff5bbec6cb0775cc1e0f5a47b3b10f85 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 20 May 2026 17:50:18 +0000 Subject: [PATCH 03/14] Address agent e2e review feedback Agent-Logs-Url: https://github.com/Azure/unbounded/sessions/bf7223a8-8101-4d8e-94c4-6f60a177756b Co-authored-by: bcho <1975118+bcho@users.noreply.github.com> --- .github/workflows/agent-e2e-kind.yaml | 213 ++++++++++++++++-- hack/agent/e2e-kind/e2e.py | 102 ++++++--- .../node-configs/labels-and-taints.json | 9 + hack/agent/e2e-kind/run-local.sh | 70 +++--- 4 files changed, 312 insertions(+), 82 deletions(-) create mode 100644 hack/agent/e2e-kind/node-configs/labels-and-taints.json diff --git a/.github/workflows/agent-e2e-kind.yaml b/.github/workflows/agent-e2e-kind.yaml index 4d11eb6d..63eea49a 100644 --- a/.github/workflows/agent-e2e-kind.yaml +++ b/.github/workflows/agent-e2e-kind.yaml @@ -48,28 +48,14 @@ permissions: jobs: agent-e2e: - name: agent e2e (${{ matrix.node_config.name }}) runs-on: ubuntu-24.04 timeout-minutes: 60 - strategy: - fail-fast: false - matrix: - node_config: - - name: default - node_labels: "" - register_with_taints: "" - - name: labels-and-taints - node_labels: e2e.unbounded-cloud.io/config=labels-and-taints - register_with_taints: e2e.unbounded-cloud.io/dedicated=agent:NoSchedule env: KIND_CLUSTER_NAME: kind VM_NAME: agent-e2e VM_SUBNET: "192.168.100" VM_IP: "192.168.100.10" AGENT_MACHINE_NAME: agent-e2e - AGENT_E2E_CONFIG_NAME: ${{ matrix.node_config.name }} - AGENT_E2E_NODE_LABELS: ${{ matrix.node_config.node_labels }} - AGENT_E2E_REGISTER_WITH_TAINTS: ${{ matrix.node_config.register_with_taints }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -186,9 +172,6 @@ jobs: - name: Wait for node to become Ready run: python3 ./hack/agent/e2e-kind/e2e.py --verbose wait-for-node - - name: Validate node config - run: python3 ./hack/agent/e2e-kind/e2e.py --verbose validate-node-config - - name: Dump persisted agent config run: python3 ./hack/agent/e2e-kind/e2e.py --verbose dump-persisted-agent-config @@ -226,9 +209,6 @@ jobs: - name: Wait for node to become Ready (rejoin) run: python3 ./hack/agent/e2e-kind/e2e.py --verbose wait-for-node - - name: Validate node config (rejoin) - run: python3 ./hack/agent/e2e-kind/e2e.py --verbose validate-node-config - - name: Dump persisted agent config (rejoin) run: python3 ./hack/agent/e2e-kind/e2e.py --verbose dump-persisted-agent-config @@ -322,3 +302,196 @@ jobs: - name: Cleanup if: always() run: python3 ./hack/agent/e2e-kind/e2e.py --verbose cleanup + + agent-config-e2e: + name: agent config e2e (${{ matrix.node_config.name }}) + runs-on: ubuntu-24.04 + timeout-minutes: 45 + strategy: + fail-fast: false + matrix: + node_config: + - name: labels-and-taints + file: hack/agent/e2e-kind/node-configs/labels-and-taints.json + env: + KIND_CLUSTER_NAME: agent-config-e2e + VM_NAME: agent-config-e2e + VM_SUBNET: "192.168.110" + VM_IP: "192.168.110.10" + AGENT_MACHINE_NAME: agent-config-e2e + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Enable KVM + run: | + echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' | sudo tee /etc/udev/rules.d/99-kvm4all.rules + sudo udevadm control --reload-rules + sudo udevadm trigger --name-match=kvm + + - name: Set up Go + uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0 + with: + go-version-file: go.mod + + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends \ + qemu-system-x86 qemu-utils genisoimage \ + iptables + + - name: Create Kind cluster + uses: helm/kind-action@ef37e7f390d99f746eb8b610417061a60e82a6cc # v1.14.0 + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + version: v0.29.0 + + - name: Configure Kind cluster networking for VM + run: | + set -euo pipefail + KIND_CONTAINER="${KIND_CLUSTER_NAME}-control-plane" + KIND_IP=$(docker inspect "${KIND_CONTAINER}" \ + --format '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}') + if [[ -z "${KIND_IP}" ]]; then + echo "::error::Could not determine Kind control-plane container IP" + exit 1 + fi + echo "[INFO] Kind control-plane IP: ${KIND_IP}" + BRIDGE="virbr-e2e" + + sudo iptables -I FORWARD -i "${BRIDGE}" -j ACCEPT + sudo iptables -I FORWARD -o "${BRIDGE}" -j ACCEPT + sudo iptables -t raw -I PREROUTING -i "${BRIDGE}" -j ACCEPT + + echo "[INFO] Patching kindnet DaemonSet for VM-reachable control plane endpoint..." + PATCH=$(cat </dev/null || true + sudo ip link add veth-kind-e2e type veth peer name eth-e2e + sudo ip link set veth-kind-e2e master "${BRIDGE}" + sudo ip link set veth-kind-e2e up + sudo ip link set eth-e2e netns "${KIND_PID}" + sudo nsenter -t "${KIND_PID}" -n ip addr add "${VM_SUBNET}.2/24" dev eth-e2e + sudo nsenter -t "${KIND_PID}" -n ip link set eth-e2e up + + - name: Install Machine CRD + run: python3 ./hack/agent/e2e-kind/e2e.py --verbose install-machine-crd + + - name: Start machina controller + run: python3 ./hack/agent/e2e-kind/e2e.py --verbose start-machina-controller + + - name: Validate machina controller + run: python3 ./hack/agent/e2e-kind/e2e.py --verbose validate-machina-controller + + - name: Run agent with node config + run: python3 ./hack/agent/e2e-kind/e2e.py --verbose --node-config "${{ matrix.node_config.file }}" run-agent + + - name: Wait for configured node to become Ready + run: python3 ./hack/agent/e2e-kind/e2e.py --verbose wait-for-node + + - name: Validate node config + run: python3 ./hack/agent/e2e-kind/e2e.py --verbose --node-config "${{ matrix.node_config.file }}" validate-node-config + + - name: Dump persisted agent config + run: python3 ./hack/agent/e2e-kind/e2e.py --verbose dump-persisted-agent-config + + - name: Validate kube-proxy on all nodes + run: python3 ./hack/agent/e2e-kind/e2e.py --verbose validate-kube-proxy + + - name: Validate Machine CR config + run: python3 ./hack/agent/e2e-kind/e2e.py --verbose --node-config "${{ matrix.node_config.file }}" validate-machine-cr-created + + - name: Validate workload on configured node + run: python3 ./hack/agent/e2e-kind/e2e.py --verbose validate-workload + + - name: Validate node config repave + run: python3 ./hack/agent/e2e-kind/e2e.py --verbose --node-config "${{ matrix.node_config.file }}" validate-node-repave-upgrade + + - name: Collect VM logs + if: always() + run: | + mkdir -p logs + VM_DIR=".vm-e2e" + cp "${VM_DIR}/${VM_NAME}.log" logs/vm-serial.log 2>/dev/null || true + SSH="ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 -i ${VM_DIR}/ssh/id_ed25519 ubuntu@${VM_IP}" + $SSH "sudo journalctl --no-pager -l" > logs/vm-journal.log 2>/dev/null || true + $SSH "sudo journalctl -u unbounded-agent --no-pager -l" > logs/vm-unbounded-agent.log 2>/dev/null || true + $SSH "sudo journalctl -u unbounded-agent-daemon --no-pager -l" > logs/vm-unbounded-agent-daemon.log 2>/dev/null || true + cp ".vm-e2e/machina-controller.log" logs/machina-controller.log 2>/dev/null || true + $SSH "sudo machinectl list --no-pager" > logs/vm-machines.txt 2>/dev/null || true + for MACHINE in kube1 kube2; do + $SSH "sudo journalctl -M ${MACHINE} --no-pager -l" > logs/nspawn-${MACHINE}-journal.log 2>/dev/null || true + $SSH "sudo journalctl -M ${MACHINE} -u kubelet --no-pager -l" > logs/nspawn-${MACHINE}-kubelet.log 2>/dev/null || true + $SSH "sudo journalctl -M ${MACHINE} -u containerd --no-pager -l" > logs/nspawn-${MACHINE}-containerd.log 2>/dev/null || true + $SSH "sudo machinectl status ${MACHINE} --no-pager" > logs/vm-machine-${MACHINE}-status.txt 2>/dev/null || true + done + + - name: Collect cluster state + if: always() + run: | + mkdir -p logs + KIND_CONTAINER="${KIND_CLUSTER_NAME}-control-plane" + kubectl get nodes -o wide > logs/nodes.txt 2>&1 || true + kubectl describe nodes > logs/nodes-describe.txt 2>&1 || true + kubectl get pods -A -o wide > logs/pods.txt 2>&1 || true + kubectl get events -A --sort-by='.lastTimestamp' > logs/events.txt 2>&1 || true + kubectl get machines -o wide > logs/machines.txt 2>&1 || true + kubectl get machines -o yaml > logs/machines-full.yaml 2>&1 || true + kubectl get machineconfigurations -o wide > logs/machineconfigurations.txt 2>&1 || true + kubectl get machineconfigurations -o yaml > logs/machineconfigurations-full.yaml 2>&1 || true + kubectl get machineconfigurationversions -o wide > logs/machineconfigurationversions.txt 2>&1 || true + kubectl get machineconfigurationversions -o yaml > logs/machineconfigurationversions-full.yaml 2>&1 || true + kubectl get machineoperations -o wide > logs/machineoperations.txt 2>&1 || true + kubectl get machineoperations -o yaml > logs/machineoperations-full.yaml 2>&1 || true + docker exec "${KIND_CONTAINER}" journalctl -u kubelet --no-pager -l > logs/kind-kubelet.log 2>&1 || true + docker exec "${KIND_CONTAINER}" crictl logs $(docker exec "${KIND_CONTAINER}" crictl ps -a --name kube-apiserver -q 2>/dev/null | head -1) > logs/kube-apiserver.log 2>&1 || true + kubectl get csr -o wide > logs/csrs.txt 2>&1 || true + kubectl describe csr > logs/csrs-describe.txt 2>&1 || true + kubectl describe pods -n e2e-workload-test > logs/workload-pods-describe.txt 2>&1 || true + kubectl logs -n e2e-workload-test --all-containers --prefix e2e-hello > logs/workload-hello.log 2>&1 || true + kubectl logs -n e2e-workload-test --all-containers --prefix e2e-dns-test > logs/workload-dns.log 2>&1 || true + + - name: Upload logs + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + if: always() + with: + name: agent-config-e2e-${{ matrix.node_config.name }}-logs + path: logs/ + retention-days: 30 + + - name: Cleanup + if: always() + run: python3 ./hack/agent/e2e-kind/e2e.py --verbose cleanup diff --git a/hack/agent/e2e-kind/e2e.py b/hack/agent/e2e-kind/e2e.py index d2037383..90831907 100755 --- a/hack/agent/e2e-kind/e2e.py +++ b/hack/agent/e2e-kind/e2e.py @@ -18,11 +18,7 @@ Options: --verbose Enable diagnostic output (network diags). - -Environment: - AGENT_E2E_CONFIG_NAME Name for the active node config variant. - AGENT_E2E_NODE_LABELS Comma-separated kubelet node labels. - AGENT_E2E_REGISTER_WITH_TAINTS Comma-separated kubelet registration taints. + --node-config PATH JSON file with node config variant settings. Subcommands (called as individual workflow steps): create-vm Create bridge networking and launch a QEMU VM. @@ -81,9 +77,6 @@ KIND_CONTAINER = f"{KIND_CLUSTER_NAME}-control-plane" AGENT_MACHINE_NAME = os.environ.get("AGENT_MACHINE_NAME", "agent-e2e") AGENT_DEBUG = os.environ.get("AGENT_DEBUG", "") -AGENT_E2E_CONFIG_NAME = os.environ.get("AGENT_E2E_CONFIG_NAME", "default") -AGENT_E2E_NODE_LABELS = os.environ.get("AGENT_E2E_NODE_LABELS", "") -AGENT_E2E_REGISTER_WITH_TAINTS = os.environ.get("AGENT_E2E_REGISTER_WITH_TAINTS", "") # Site name used when generating the bootstrap script via kubectl-unbounded. E2E_SITE_NAME = "e2e" @@ -120,6 +113,11 @@ MACHINE_CONFIG_NAME = f"{AGENT_MACHINE_NAME}-config" DAEMON_BINARY_CURRENT = "/usr/local/bin/unbounded-agent-current" DAEMON_BINARY_LAST_GOOD = "/usr/local/bin/unbounded-agent-last-good" +NODE_CONFIG: dict[str, Any] = { + "name": "default", + "nodeLabels": {}, + "registerWithTaints": [], +} # --------------------------------------------------------------------------- @@ -205,37 +203,73 @@ def _b64(val: str) -> str: return base64.b64encode(val.encode()).decode() -def _csv_env_values(raw: str) -> list[str]: - """Split a comma-separated environment value into non-empty entries.""" - return [item.strip() for item in raw.split(",") if item.strip()] +def load_node_config(path: str | None) -> dict[str, Any]: + """Load a node config variant from *path*, or return the default config.""" + if not path: + return { + "name": "default", + "nodeLabels": {}, + "registerWithTaints": [], + } + + config_path = Path(path) + if not config_path.is_absolute(): + config_path = REPO_ROOT / config_path + + try: + cfg = json.loads(config_path.read_text()) + except Exception as exc: + die(f"failed to read node config {config_path}: {exc}") + + if not isinstance(cfg, dict): + die(f"node config {config_path} must contain a JSON object") + + name = cfg.get("name", config_path.stem) + node_labels = cfg.get("nodeLabels", {}) + register_with_taints = cfg.get("registerWithTaints", []) + + if not isinstance(name, str) or not name: + die(f"node config {config_path} field 'name' must be a non-empty string") + if not isinstance(node_labels, dict) or not all( + isinstance(key, str) and isinstance(value, str) + for key, value in node_labels.items() + ): + die(f"node config {config_path} field 'nodeLabels' must be an object of string values") + if not isinstance(register_with_taints, list) or not all( + isinstance(taint, str) for taint in register_with_taints + ): + die(f"node config {config_path} field 'registerWithTaints' must be a list of strings") + + return { + "name": name, + "nodeLabels": dict(node_labels), + "registerWithTaints": list(register_with_taints), + } def expected_node_labels() -> dict[str, str]: """Return labels configured for this e2e node variant.""" - labels: dict[str, str] = {} - for item in _csv_env_values(AGENT_E2E_NODE_LABELS): - if "=" not in item: - die(f"invalid AGENT_E2E_NODE_LABELS entry {item!r}, expected key=value") - key, value = item.split("=", 1) - if not key: - die(f"invalid AGENT_E2E_NODE_LABELS entry {item!r}, label key is empty") - labels[key] = value - return labels + return dict(NODE_CONFIG["nodeLabels"]) + + +def expected_node_taint_strings() -> list[str]: + """Return configured taint strings for this e2e node variant.""" + return list(NODE_CONFIG["registerWithTaints"]) def expected_node_taints() -> list[dict[str, str]]: """Return taints configured for this e2e node variant.""" taints: list[dict[str, str]] = [] - for item in _csv_env_values(AGENT_E2E_REGISTER_WITH_TAINTS): + for item in expected_node_taint_strings(): if ":" not in item: - die(f"invalid AGENT_E2E_REGISTER_WITH_TAINTS entry {item!r}, expected key[=value]:Effect") + die(f"invalid registerWithTaints entry {item!r}, expected key[=value]:Effect") body, effect = item.rsplit(":", 1) if "=" in body: key, value = body.split("=", 1) else: key, value = body, "" if not key or not effect: - die(f"invalid AGENT_E2E_REGISTER_WITH_TAINTS entry {item!r}, key and effect are required") + die(f"invalid registerWithTaints entry {item!r}, key and effect are required") taints.append({"key": key, "value": value, "effect": effect}) return taints @@ -243,18 +277,18 @@ def expected_node_taints() -> list[dict[str, str]]: def node_config_bootstrap_args() -> list[str]: """Return manual-bootstrap flags for the active node config variant.""" args: list[str] = [] - for label in _csv_env_values(AGENT_E2E_NODE_LABELS): - args.extend(["--node-label", label]) - for taint in _csv_env_values(AGENT_E2E_REGISTER_WITH_TAINTS): + for key, value in sorted(expected_node_labels().items()): + args.extend(["--node-label", f"{key}={value}"]) + for taint in expected_node_taint_strings(): args.extend(["--register-with-taint", taint]) return args def log_active_node_config() -> None: """Log the active e2e node config variant.""" - labels = _csv_env_values(AGENT_E2E_NODE_LABELS) - taints = _csv_env_values(AGENT_E2E_REGISTER_WITH_TAINTS) - log(f"Agent e2e node config variant: {AGENT_E2E_CONFIG_NAME}") + labels = [f"{key}={value}" for key, value in sorted(expected_node_labels().items())] + taints = expected_node_taint_strings() + log(f"Agent e2e node config variant: {NODE_CONFIG['name']}") log(f" node labels: {', '.join(labels) if labels else ''}") log(f" register-with-taints: {', '.join(taints) if taints else ''}") @@ -1949,7 +1983,7 @@ def validate_machine_cr_created() -> None: if actual != value: die(f"Machine CR nodeLabels mismatch for {key!r}: got {actual!r}, expected {value!r}") - expected_taints = _csv_env_values(AGENT_E2E_REGISTER_WITH_TAINTS) + expected_taints = expected_node_taint_strings() actual_taints = k8s_spec.get("registerWithTaints") or [] for taint in expected_taints: if taint not in actual_taints: @@ -2284,7 +2318,7 @@ def cleanup() -> None: def main() -> None: - global VERBOSE # noqa: PLW0603 + global NODE_CONFIG, VERBOSE # noqa: PLW0603 parser = argparse.ArgumentParser( description="Agent E2E Kind test harness", @@ -2300,8 +2334,14 @@ def main() -> None: default=False, help="Enable verbose diagnostic output", ) + parser.add_argument( + "--node-config", + default="", + help="Path to a JSON node config variant file", + ) args = parser.parse_args() VERBOSE = args.verbose + NODE_CONFIG = load_node_config(args.node_config) COMMANDS[args.command]() diff --git a/hack/agent/e2e-kind/node-configs/labels-and-taints.json b/hack/agent/e2e-kind/node-configs/labels-and-taints.json new file mode 100644 index 00000000..2391c42b --- /dev/null +++ b/hack/agent/e2e-kind/node-configs/labels-and-taints.json @@ -0,0 +1,9 @@ +{ + "name": "labels-and-taints", + "nodeLabels": { + "e2e.unbounded-cloud.io/config": "labels-and-taints" + }, + "registerWithTaints": [ + "e2e.unbounded-cloud.io/dedicated=agent:NoSchedule" + ] +} diff --git a/hack/agent/e2e-kind/run-local.sh b/hack/agent/e2e-kind/run-local.sh index 7514f97c..5762cae3 100755 --- a/hack/agent/e2e-kind/run-local.sh +++ b/hack/agent/e2e-kind/run-local.sh @@ -26,23 +26,31 @@ # Usage: # ./hack/agent/e2e-kind/run-local.sh # ./hack/agent/e2e-kind/run-local.sh --verbose # enable diagnostic output -# -# Optional node config variant environment: -# AGENT_E2E_CONFIG_NAME=labels-and-taints \ -# AGENT_E2E_NODE_LABELS=e2e.unbounded-cloud.io/config=labels-and-taints \ -# AGENT_E2E_REGISTER_WITH_TAINTS=e2e.unbounded-cloud.io/dedicated=agent:NoSchedule \ -# ./hack/agent/e2e-kind/run-local.sh +# ./hack/agent/e2e-kind/run-local.sh \ +# --node-config hack/agent/e2e-kind/node-configs/labels-and-taints.json set -euo pipefail REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)" E2E="${REPO_ROOT}/hack/agent/e2e-kind/e2e.py" -# Forward --verbose to e2e.py when passed to this script. -E2E_VERBOSE="" -for arg in "$@"; do +# Forward selected options to e2e.py. +E2E_ARGS=() +while [[ $# -gt 0 ]]; do + arg="$1" case "$arg" in - --verbose) E2E_VERBOSE="--verbose" ;; + --verbose) + E2E_ARGS+=("--verbose") + shift + ;; + --node-config) + if [[ $# -lt 2 ]]; then + echo "[ERROR] --node-config requires a path" >&2 + exit 1 + fi + E2E_ARGS+=("--node-config" "$2") + shift 2 + ;; *) echo "[ERROR] Unknown argument: $arg" >&2; exit 1 ;; esac done @@ -130,7 +138,7 @@ cleanup_forwarding() { cleanup() { info "Running cleanup..." cleanup_forwarding "${BRIDGE}" - python3 "$E2E" $E2E_VERBOSE cleanup 2>/dev/null || true + python3 "$E2E" "${E2E_ARGS[@]}" cleanup 2>/dev/null || true kind delete cluster --name "${KIND_CLUSTER_NAME}" 2>/dev/null || true } trap cleanup EXIT @@ -200,7 +208,7 @@ kubectl -n kube-system rollout status daemonset/kindnet --timeout=60s # --------------------------------------------------------------------------- # QEMU VM # --------------------------------------------------------------------------- -python3 "$E2E" $E2E_VERBOSE create-vm +python3 "$E2E" "${E2E_ARGS[@]}" create-vm # Attach Kind container to VM bridge via a veth pair so that the VM # subnet is directly reachable at L2. @@ -222,7 +230,7 @@ fi # --------------------------------------------------------------------------- # Install Machine CRD # --------------------------------------------------------------------------- -python3 "$E2E" $E2E_VERBOSE install-machine-crd +python3 "$E2E" "${E2E_ARGS[@]}" install-machine-crd # --------------------------------------------------------------------------- # Initial join: agent self-registers Machine CR @@ -233,13 +241,13 @@ echo " Phase 1: Initial join (no pre-existing CR)" echo "============================================" echo "" -python3 "$E2E" $E2E_VERBOSE run-agent -python3 "$E2E" $E2E_VERBOSE wait-for-node -python3 "$E2E" $E2E_VERBOSE validate-node-config -python3 "$E2E" $E2E_VERBOSE dump-persisted-agent-config -python3 "$E2E" $E2E_VERBOSE validate-kube-proxy -python3 "$E2E" $E2E_VERBOSE validate-machine-cr-created -python3 "$E2E" $E2E_VERBOSE validate-workload +python3 "$E2E" "${E2E_ARGS[@]}" run-agent +python3 "$E2E" "${E2E_ARGS[@]}" wait-for-node +python3 "$E2E" "${E2E_ARGS[@]}" validate-node-config +python3 "$E2E" "${E2E_ARGS[@]}" dump-persisted-agent-config +python3 "$E2E" "${E2E_ARGS[@]}" validate-kube-proxy +python3 "$E2E" "${E2E_ARGS[@]}" validate-machine-cr-created +python3 "$E2E" "${E2E_ARGS[@]}" validate-workload # --------------------------------------------------------------------------- # Reset and rejoin @@ -250,17 +258,17 @@ echo " Phase 2: Reset and rejoin" echo "============================================" echo "" -python3 "$E2E" $E2E_VERBOSE reset-agent -python3 "$E2E" $E2E_VERBOSE delete-machine-cr - -python3 "$E2E" $E2E_VERBOSE ensure-kind-bridge -python3 "$E2E" $E2E_VERBOSE run-agent -python3 "$E2E" $E2E_VERBOSE wait-for-node -python3 "$E2E" $E2E_VERBOSE validate-node-config -python3 "$E2E" $E2E_VERBOSE dump-persisted-agent-config -python3 "$E2E" $E2E_VERBOSE validate-kube-proxy -python3 "$E2E" $E2E_VERBOSE validate-machine-cr-created -python3 "$E2E" $E2E_VERBOSE validate-workload +python3 "$E2E" "${E2E_ARGS[@]}" reset-agent +python3 "$E2E" "${E2E_ARGS[@]}" delete-machine-cr + +python3 "$E2E" "${E2E_ARGS[@]}" ensure-kind-bridge +python3 "$E2E" "${E2E_ARGS[@]}" run-agent +python3 "$E2E" "${E2E_ARGS[@]}" wait-for-node +python3 "$E2E" "${E2E_ARGS[@]}" validate-node-config +python3 "$E2E" "${E2E_ARGS[@]}" dump-persisted-agent-config +python3 "$E2E" "${E2E_ARGS[@]}" validate-kube-proxy +python3 "$E2E" "${E2E_ARGS[@]}" validate-machine-cr-created +python3 "$E2E" "${E2E_ARGS[@]}" validate-workload # --------------------------------------------------------------------------- # Done (cleanup runs via trap) From 288226f53cb7e2bae792367f42da48f93e767521 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 20 May 2026 18:04:05 +0000 Subject: [PATCH 04/14] Address agent e2e follow-up feedback Agent-Logs-Url: https://github.com/Azure/unbounded/sessions/442e0e77-0155-41c7-82c5-4b62834be2b7 Co-authored-by: bcho <1975118+bcho@users.noreply.github.com> --- .../agent-e2e-kind-control-plane/action.yaml | 105 ++++++++++ .github/workflows/agent-e2e-kind.yaml | 184 +----------------- hack/agent/e2e-kind/e2e.py | 124 ++++++------ hack/agent/e2e-kind/node-configs/README.md | 5 + 4 files changed, 187 insertions(+), 231 deletions(-) create mode 100644 .github/actions/agent-e2e-kind-control-plane/action.yaml create mode 100644 hack/agent/e2e-kind/node-configs/README.md diff --git a/.github/actions/agent-e2e-kind-control-plane/action.yaml b/.github/actions/agent-e2e-kind-control-plane/action.yaml new file mode 100644 index 00000000..664a1b41 --- /dev/null +++ b/.github/actions/agent-e2e-kind-control-plane/action.yaml @@ -0,0 +1,105 @@ +name: Setup agent e2e Kind control plane +description: Set up KVM, dependencies, a Kind control plane, and VM bridge networking for agent e2e tests. +inputs: + cluster-name: + description: Kind cluster name. + required: true + vm-subnet: + description: VM subnet prefix. + required: true +runs: + using: composite + steps: + - name: Enable KVM + shell: bash + run: | + echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' | sudo tee /etc/udev/rules.d/99-kvm4all.rules + sudo udevadm control --reload-rules + sudo udevadm trigger --name-match=kvm + + - name: Set up Go + uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0 + with: + go-version-file: go.mod + + - name: Install system dependencies + shell: bash + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends \ + qemu-system-x86 qemu-utils genisoimage \ + iptables + + - name: Create Kind cluster + uses: helm/kind-action@ef37e7f390d99f746eb8b610417061a60e82a6cc # v1.14.0 + with: + cluster_name: ${{ inputs['cluster-name'] }} + version: v0.29.0 + + - name: Configure Kind cluster networking for VM + shell: bash + env: + KIND_CLUSTER_NAME: ${{ inputs['cluster-name'] }} + run: | + set -euo pipefail + KIND_CONTAINER="${KIND_CLUSTER_NAME}-control-plane" + KIND_IP=$(docker inspect "${KIND_CONTAINER}" \ + --format '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}') + if [[ -z "${KIND_IP}" ]]; then + echo "::error::Could not determine Kind control-plane container IP" + exit 1 + fi + echo "[INFO] Kind control-plane IP: ${KIND_IP}" + BRIDGE="virbr-e2e" + + sudo iptables -I FORWARD -i "${BRIDGE}" -j ACCEPT + sudo iptables -I FORWARD -o "${BRIDGE}" -j ACCEPT + sudo iptables -t raw -I PREROUTING -i "${BRIDGE}" -j ACCEPT + + echo "[INFO] Patching kindnet DaemonSet for VM-reachable control plane endpoint..." + PATCH=$(cat </dev/null || true + sudo ip link add veth-kind-e2e type veth peer name eth-e2e + sudo ip link set veth-kind-e2e master "${BRIDGE}" + sudo ip link set veth-kind-e2e up + sudo ip link set eth-e2e netns "${KIND_PID}" + sudo nsenter -t "${KIND_PID}" -n ip addr add "${VM_SUBNET}.2/24" dev eth-e2e + sudo nsenter -t "${KIND_PID}" -n ip link set eth-e2e up diff --git a/.github/workflows/agent-e2e-kind.yaml b/.github/workflows/agent-e2e-kind.yaml index 63eea49a..62b172f3 100644 --- a/.github/workflows/agent-e2e-kind.yaml +++ b/.github/workflows/agent-e2e-kind.yaml @@ -60,101 +60,11 @@ jobs: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - name: Enable KVM - run: | - echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' | sudo tee /etc/udev/rules.d/99-kvm4all.rules - sudo udevadm control --reload-rules - sudo udevadm trigger --name-match=kvm - - - name: Set up Go - uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0 - with: - go-version-file: go.mod - - - name: Install system dependencies - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends \ - qemu-system-x86 qemu-utils genisoimage \ - iptables - - - name: Create Kind cluster - uses: helm/kind-action@ef37e7f390d99f746eb8b610417061a60e82a6cc # v1.14.0 + - name: Set up test control plane + uses: ./.github/actions/agent-e2e-kind-control-plane with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - version: v0.29.0 - - - name: Configure Kind cluster networking for VM - run: | - set -euo pipefail - KIND_CONTAINER="${KIND_CLUSTER_NAME}-control-plane" - KIND_IP=$(docker inspect "${KIND_CONTAINER}" \ - --format '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}') - if [[ -z "${KIND_IP}" ]]; then - echo "::error::Could not determine Kind control-plane container IP" - exit 1 - fi - echo "[INFO] Kind control-plane IP: ${KIND_IP}" - BRIDGE="virbr-e2e" - - # Allow forwarding between the VM bridge and Docker bridge. - sudo iptables -I FORWARD -i "${BRIDGE}" -j ACCEPT - sudo iptables -I FORWARD -o "${BRIDGE}" -j ACCEPT - - # Docker may insert raw PREROUTING DROP rules that block non-Docker - # traffic to container IPs. Insert an ACCEPT so the VM can reach the - # Kind API server. - sudo iptables -t raw -I PREROUTING -i "${BRIDGE}" -j ACCEPT - - # Patch kindnet so CONTROL_PLANE_ENDPOINT uses the container IP instead - # of the hostname (which is unresolvable from the VM). - echo "[INFO] Patching kindnet DaemonSet for VM-reachable control plane endpoint..." - PATCH=$(cat <" and the kernel rejects these when - # the gateway is only reachable via an indirect route. - echo "[INFO] Attaching Kind container to ${BRIDGE} bridge..." - KIND_PID=$(docker inspect "${KIND_CONTAINER}" --format '{{.State.Pid}}') - sudo ip link delete veth-kind-e2e 2>/dev/null || true - sudo ip link add veth-kind-e2e type veth peer name eth-e2e - sudo ip link set veth-kind-e2e master "${BRIDGE}" - sudo ip link set veth-kind-e2e up - sudo ip link set eth-e2e netns "${KIND_PID}" - sudo nsenter -t "${KIND_PID}" -n ip addr add "${VM_SUBNET}.2/24" dev eth-e2e - sudo nsenter -t "${KIND_PID}" -n ip link set eth-e2e up + cluster-name: ${{ env.KIND_CLUSTER_NAME }} + vm-subnet: ${{ env.VM_SUBNET }} - name: Install Machine CRD run: python3 ./hack/agent/e2e-kind/e2e.py --verbose install-machine-crd @@ -323,89 +233,11 @@ jobs: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - name: Enable KVM - run: | - echo 'KERNEL=="kvm", GROUP="kvm", MODE="0666", OPTIONS+="static_node=kvm"' | sudo tee /etc/udev/rules.d/99-kvm4all.rules - sudo udevadm control --reload-rules - sudo udevadm trigger --name-match=kvm - - - name: Set up Go - uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0 - with: - go-version-file: go.mod - - - name: Install system dependencies - run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends \ - qemu-system-x86 qemu-utils genisoimage \ - iptables - - - name: Create Kind cluster - uses: helm/kind-action@ef37e7f390d99f746eb8b610417061a60e82a6cc # v1.14.0 + - name: Set up test control plane + uses: ./.github/actions/agent-e2e-kind-control-plane with: - cluster_name: ${{ env.KIND_CLUSTER_NAME }} - version: v0.29.0 - - - name: Configure Kind cluster networking for VM - run: | - set -euo pipefail - KIND_CONTAINER="${KIND_CLUSTER_NAME}-control-plane" - KIND_IP=$(docker inspect "${KIND_CONTAINER}" \ - --format '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}') - if [[ -z "${KIND_IP}" ]]; then - echo "::error::Could not determine Kind control-plane container IP" - exit 1 - fi - echo "[INFO] Kind control-plane IP: ${KIND_IP}" - BRIDGE="virbr-e2e" - - sudo iptables -I FORWARD -i "${BRIDGE}" -j ACCEPT - sudo iptables -I FORWARD -o "${BRIDGE}" -j ACCEPT - sudo iptables -t raw -I PREROUTING -i "${BRIDGE}" -j ACCEPT - - echo "[INFO] Patching kindnet DaemonSet for VM-reachable control plane endpoint..." - PATCH=$(cat </dev/null || true - sudo ip link add veth-kind-e2e type veth peer name eth-e2e - sudo ip link set veth-kind-e2e master "${BRIDGE}" - sudo ip link set veth-kind-e2e up - sudo ip link set eth-e2e netns "${KIND_PID}" - sudo nsenter -t "${KIND_PID}" -n ip addr add "${VM_SUBNET}.2/24" dev eth-e2e - sudo nsenter -t "${KIND_PID}" -n ip link set eth-e2e up + cluster-name: ${{ env.KIND_CLUSTER_NAME }} + vm-subnet: ${{ env.VM_SUBNET }} - name: Install Machine CRD run: python3 ./hack/agent/e2e-kind/e2e.py --verbose install-machine-crd diff --git a/hack/agent/e2e-kind/e2e.py b/hack/agent/e2e-kind/e2e.py index 90831907..1b631a3f 100755 --- a/hack/agent/e2e-kind/e2e.py +++ b/hack/agent/e2e-kind/e2e.py @@ -57,7 +57,7 @@ from http.server import HTTPServer, SimpleHTTPRequestHandler from pathlib import Path from threading import Thread -from typing import Any +from typing import Any, Callable # --------------------------------------------------------------------------- # Paths and defaults @@ -113,11 +113,6 @@ MACHINE_CONFIG_NAME = f"{AGENT_MACHINE_NAME}-config" DAEMON_BINARY_CURRENT = "/usr/local/bin/unbounded-agent-current" DAEMON_BINARY_LAST_GOOD = "/usr/local/bin/unbounded-agent-last-good" -NODE_CONFIG: dict[str, Any] = { - "name": "default", - "nodeLabels": {}, - "registerWithTaints": [], -} # --------------------------------------------------------------------------- @@ -247,20 +242,20 @@ def load_node_config(path: str | None) -> dict[str, Any]: } -def expected_node_labels() -> dict[str, str]: +def expected_node_labels(node_config: dict[str, Any]) -> dict[str, str]: """Return labels configured for this e2e node variant.""" - return dict(NODE_CONFIG["nodeLabels"]) + return dict(node_config["nodeLabels"]) -def expected_node_taint_strings() -> list[str]: +def expected_node_taint_strings(node_config: dict[str, Any]) -> list[str]: """Return configured taint strings for this e2e node variant.""" - return list(NODE_CONFIG["registerWithTaints"]) + return list(node_config["registerWithTaints"]) -def expected_node_taints() -> list[dict[str, str]]: +def expected_node_taints(node_config: dict[str, Any]) -> list[dict[str, str]]: """Return taints configured for this e2e node variant.""" taints: list[dict[str, str]] = [] - for item in expected_node_taint_strings(): + for item in expected_node_taint_strings(node_config): if ":" not in item: die(f"invalid registerWithTaints entry {item!r}, expected key[=value]:Effect") body, effect = item.rsplit(":", 1) @@ -274,21 +269,21 @@ def expected_node_taints() -> list[dict[str, str]]: return taints -def node_config_bootstrap_args() -> list[str]: +def node_config_bootstrap_args(node_config: dict[str, Any]) -> list[str]: """Return manual-bootstrap flags for the active node config variant.""" args: list[str] = [] - for key, value in sorted(expected_node_labels().items()): + for key, value in sorted(expected_node_labels(node_config).items()): args.extend(["--node-label", f"{key}={value}"]) - for taint in expected_node_taint_strings(): + for taint in expected_node_taint_strings(node_config): args.extend(["--register-with-taint", taint]) return args -def log_active_node_config() -> None: +def log_active_node_config(node_config: dict[str, Any]) -> None: """Log the active e2e node config variant.""" - labels = [f"{key}={value}" for key, value in sorted(expected_node_labels().items())] - taints = expected_node_taint_strings() - log(f"Agent e2e node config variant: {NODE_CONFIG['name']}") + labels = [f"{key}={value}" for key, value in sorted(expected_node_labels(node_config).items())] + taints = expected_node_taint_strings(node_config) + log(f"Agent e2e node config variant: {node_config['name']}") log(f" node labels: {', '.join(labels) if labels else ''}") log(f" register-with-taints: {', '.join(taints) if taints else ''}") @@ -1024,7 +1019,7 @@ def ensure_kind_bridge() -> None: # --------------------------------------------------------------------------- # run-agent # --------------------------------------------------------------------------- -def run_agent() -> None: +def run_agent(node_config: dict[str, Any]) -> None: """Build agent, generate bootstrap script, and run it on the VM.""" if not SSH_KEY.exists(): @@ -1066,7 +1061,7 @@ def run_agent() -> None: log(f"Agent download URL: {agent_url}") try: - _run_agent_inner(agent_url) + _run_agent_inner(agent_url, node_config) finally: httpd.shutdown() @@ -1083,7 +1078,7 @@ def log_message(self, format: str, *args: Any) -> None: # noqa: A002 return Handler -def _run_agent_inner(agent_url: str) -> None: +def _run_agent_inner(agent_url: str, node_config: dict[str, Any]) -> None: """Core logic for run-agent (after HTTP server is up).""" # Determine the Kind control-plane IP so connectivity checks have the @@ -1138,7 +1133,7 @@ def _run_agent_inner(agent_url: str) -> None: # version, and cluster DNS from the active kubeconfig. The bootstrap # token is resolved via the site label on the secret. log("Generating bootstrap script with kubectl-unbounded machine manual-bootstrap...") - log_active_node_config() + log_active_node_config(node_config) # Capture the local API server URL from the kubeconfig (typically # https://127.0.0.1: for Kind) so we can replace it with the @@ -1156,7 +1151,7 @@ def _run_agent_inner(agent_url: str) -> None: KUBECTL_UNBOUNDED, "machine", "manual-bootstrap", AGENT_MACHINE_NAME, "--site", E2E_SITE_NAME, - *node_config_bootstrap_args(), + *node_config_bootstrap_args(node_config), ] bootstrap_script = capture(bootstrap_args) @@ -1242,9 +1237,9 @@ def wait_for_node() -> None: # --------------------------------------------------------------------------- # validate-node-config # --------------------------------------------------------------------------- -def _assert_expected_node_config(node: dict[str, Any]) -> None: - expected_labels = expected_node_labels() - expected_taints = expected_node_taints() +def _assert_expected_node_config(node: dict[str, Any], node_config: dict[str, Any]) -> None: + expected_labels = expected_node_labels(node_config) + expected_taints = expected_node_taints(node_config) labels = node.get("metadata", {}).get("labels", {}) for key, value in expected_labels.items(): @@ -1262,13 +1257,21 @@ def _assert_expected_node_config(node: dict[str, Any]) -> None: ): die(f"expected node taint not found: {expected}; node taints: {taints}") + internal_ips = [ + address.get("address") + for address in node.get("status", {}).get("addresses", []) + if address.get("type") == "InternalIP" + ] + if VM_IP not in internal_ips: + die(f"node InternalIP mismatch: got {internal_ips}, expected {VM_IP!r}") + -def validate_node_config() -> None: +def validate_node_config(node_config: dict[str, Any]) -> None: """Verify configured node labels and taints are present on the Node.""" - log_active_node_config() + log_active_node_config(node_config) node = json.loads(kubectl_capture(["get", "node", AGENT_MACHINE_NAME, "-o", "json"])) - _assert_expected_node_config(node) + _assert_expected_node_config(node, node_config) log("============================================") log(" Node config validation PASSED") @@ -1921,7 +1924,7 @@ def delete_machine_cr() -> None: # --------------------------------------------------------------------------- # validate-machine-cr-created # --------------------------------------------------------------------------- -def validate_machine_cr_created() -> None: +def validate_machine_cr_created(node_config: dict[str, Any]) -> None: """Validate the agent self-registered a Machine CR during bootstrap. The daemon registers the Machine CR at startup, so this function polls @@ -1976,14 +1979,14 @@ def validate_machine_cr_created() -> None: log(f"bootstrapTokenRef is correct: {token_ref}") - expected_labels = expected_node_labels() + expected_labels = expected_node_labels(node_config) actual_labels = k8s_spec.get("nodeLabels") or {} for key, value in expected_labels.items(): actual = actual_labels.get(key) if actual != value: die(f"Machine CR nodeLabels mismatch for {key!r}: got {actual!r}, expected {value!r}") - expected_taints = expected_node_taint_strings() + expected_taints = expected_node_taint_strings(node_config) actual_taints = k8s_spec.get("registerWithTaints") or [] for taint in expected_taints: if taint not in actual_taints: @@ -2129,7 +2132,7 @@ def _next_patch_version(version: str) -> str: return "v" + ".".join(parts) -def validate_node_repave_upgrade() -> None: +def validate_node_repave_upgrade(node_config: dict[str, Any]) -> None: """Validate OnDelete repave applies a new MCV Kubernetes version.""" config_name = MACHINE_CONFIG_NAME @@ -2153,7 +2156,7 @@ def validate_node_repave_upgrade() -> None: ).setdefault("kubernetes", {}) kubernetes_template["version"] = target_kubelet_version kubernetes_template["nodeLabels"] = { - **expected_node_labels(), + **expected_node_labels(node_config), "e2e.unbounded-cloud.io/config-version": "v3", } kubectl(["apply", "-f", "-"], input=json.dumps(manifest).encode()) @@ -2203,7 +2206,7 @@ def validate_node_repave_upgrade() -> None: wait_for_node() wait_for_node_kubelet_version(AGENT_MACHINE_NAME, target_kubelet_version) node = json.loads(kubectl_capture(["get", "node", AGENT_MACHINE_NAME, "-o", "json"])) - _assert_expected_node_config(node) + _assert_expected_node_config(node, node_config) machine = json.loads(kubectl_capture(["get", "machine", AGENT_MACHINE_NAME, "-o", "json"])) status_config = machine.get("status", {}).get("configuration", {}) @@ -2294,31 +2297,42 @@ def cleanup() -> None: # --------------------------------------------------------------------------- # main # --------------------------------------------------------------------------- -COMMANDS = { - "create-vm": create_vm, - "ensure-kind-bridge": ensure_kind_bridge, - "dump-persisted-agent-config": dump_persisted_agent_config, +Command = Callable[[dict[str, Any]], None] + + +def _without_node_config(func: Callable[[], None]) -> Command: + """Adapt a command that does not use node config settings.""" + def command(_node_config: dict[str, Any]) -> None: + func() + + return command + + +COMMANDS: dict[str, Command] = { + "create-vm": _without_node_config(create_vm), + "ensure-kind-bridge": _without_node_config(ensure_kind_bridge), + "dump-persisted-agent-config": _without_node_config(dump_persisted_agent_config), "run-agent": run_agent, - "wait-for-node": wait_for_node, + "wait-for-node": _without_node_config(wait_for_node), "validate-node-config": validate_node_config, - "validate-kube-proxy": validate_kube_proxy, - "validate-workload": validate_workload, - "install-machine-crd": install_machine_crd, - "start-machina-controller": start_machina_controller, - "validate-machina-controller": validate_machina_controller, - "delete-machine-cr": delete_machine_cr, + "validate-kube-proxy": _without_node_config(validate_kube_proxy), + "validate-workload": _without_node_config(validate_workload), + "install-machine-crd": _without_node_config(install_machine_crd), + "start-machina-controller": _without_node_config(start_machina_controller), + "validate-machina-controller": _without_node_config(validate_machina_controller), + "delete-machine-cr": _without_node_config(delete_machine_cr), "validate-machine-cr-created": validate_machine_cr_created, - "validate-node-reboot-operation": validate_node_reboot_operation, - "validate-agent-upgrade-operation": validate_agent_upgrade_operation, - "validate-agent-upgrade-rollback": validate_agent_upgrade_rollback, + "validate-node-reboot-operation": _without_node_config(validate_node_reboot_operation), + "validate-agent-upgrade-operation": _without_node_config(validate_agent_upgrade_operation), + "validate-agent-upgrade-rollback": _without_node_config(validate_agent_upgrade_rollback), "validate-node-repave-upgrade": validate_node_repave_upgrade, - "reset-agent": reset_agent, - "cleanup": cleanup, + "reset-agent": _without_node_config(reset_agent), + "cleanup": _without_node_config(cleanup), } def main() -> None: - global NODE_CONFIG, VERBOSE # noqa: PLW0603 + global VERBOSE # noqa: PLW0603 parser = argparse.ArgumentParser( description="Agent E2E Kind test harness", @@ -2341,9 +2355,9 @@ def main() -> None: ) args = parser.parse_args() VERBOSE = args.verbose - NODE_CONFIG = load_node_config(args.node_config) + node_config = load_node_config(args.node_config) - COMMANDS[args.command]() + COMMANDS[args.command](node_config) if __name__ == "__main__": diff --git a/hack/agent/e2e-kind/node-configs/README.md b/hack/agent/e2e-kind/node-configs/README.md new file mode 100644 index 00000000..35d1b12c --- /dev/null +++ b/hack/agent/e2e-kind/node-configs/README.md @@ -0,0 +1,5 @@ +# Agent e2e node configs + +This folder contains different kinds of agent config used by the agent e2e tests. +Each file describes one node config scenario that can be passed to `e2e.py` with +`--node-config`. From 7f2fc8ebe1e0e4abf01bb76430965607f23a4214 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 20 May 2026 18:11:10 +0000 Subject: [PATCH 05/14] Discover agent node configs in e2e job Agent-Logs-Url: https://github.com/Azure/unbounded/sessions/6951efd5-d828-4068-8b85-f5c2cc79ddc5 Co-authored-by: bcho <1975118+bcho@users.noreply.github.com> --- .github/workflows/agent-e2e-kind.yaml | 65 ++++++++++++++------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/.github/workflows/agent-e2e-kind.yaml b/.github/workflows/agent-e2e-kind.yaml index 62b172f3..89b66fbe 100644 --- a/.github/workflows/agent-e2e-kind.yaml +++ b/.github/workflows/agent-e2e-kind.yaml @@ -214,15 +214,9 @@ jobs: run: python3 ./hack/agent/e2e-kind/e2e.py --verbose cleanup agent-config-e2e: - name: agent config e2e (${{ matrix.node_config.name }}) + name: agent config e2e runs-on: ubuntu-24.04 - timeout-minutes: 45 - strategy: - fail-fast: false - matrix: - node_config: - - name: labels-and-taints - file: hack/agent/e2e-kind/node-configs/labels-and-taints.json + timeout-minutes: 60 env: KIND_CLUSTER_NAME: agent-config-e2e VM_NAME: agent-config-e2e @@ -248,29 +242,36 @@ jobs: - name: Validate machina controller run: python3 ./hack/agent/e2e-kind/e2e.py --verbose validate-machina-controller - - name: Run agent with node config - run: python3 ./hack/agent/e2e-kind/e2e.py --verbose --node-config "${{ matrix.node_config.file }}" run-agent - - - name: Wait for configured node to become Ready - run: python3 ./hack/agent/e2e-kind/e2e.py --verbose wait-for-node - - - name: Validate node config - run: python3 ./hack/agent/e2e-kind/e2e.py --verbose --node-config "${{ matrix.node_config.file }}" validate-node-config - - - name: Dump persisted agent config - run: python3 ./hack/agent/e2e-kind/e2e.py --verbose dump-persisted-agent-config - - - name: Validate kube-proxy on all nodes - run: python3 ./hack/agent/e2e-kind/e2e.py --verbose validate-kube-proxy - - - name: Validate Machine CR config - run: python3 ./hack/agent/e2e-kind/e2e.py --verbose --node-config "${{ matrix.node_config.file }}" validate-machine-cr-created - - - name: Validate workload on configured node - run: python3 ./hack/agent/e2e-kind/e2e.py --verbose validate-workload - - - name: Validate node config repave - run: python3 ./hack/agent/e2e-kind/e2e.py --verbose --node-config "${{ matrix.node_config.file }}" validate-node-repave-upgrade + - name: Discover and validate node configs + run: | + set -euo pipefail + mapfile -t NODE_CONFIGS < <(find hack/agent/e2e-kind/node-configs \ + -maxdepth 1 -type f -name '*.json' | sort) + if [[ "${#NODE_CONFIGS[@]}" -eq 0 ]]; then + echo "::error::No node config scenarios found" + exit 1 + fi + + last_index=$((${#NODE_CONFIGS[@]} - 1)) + for index in "${!NODE_CONFIGS[@]}"; do + node_config="${NODE_CONFIGS[$index]}" + scenario="$(basename "${node_config}" .json)" + echo "::group::agent config e2e: ${scenario}" + python3 ./hack/agent/e2e-kind/e2e.py --verbose --node-config "${node_config}" run-agent + python3 ./hack/agent/e2e-kind/e2e.py --verbose wait-for-node + python3 ./hack/agent/e2e-kind/e2e.py --verbose --node-config "${node_config}" validate-node-config + python3 ./hack/agent/e2e-kind/e2e.py --verbose dump-persisted-agent-config + python3 ./hack/agent/e2e-kind/e2e.py --verbose validate-kube-proxy + python3 ./hack/agent/e2e-kind/e2e.py --verbose --node-config "${node_config}" validate-machine-cr-created + python3 ./hack/agent/e2e-kind/e2e.py --verbose validate-workload + python3 ./hack/agent/e2e-kind/e2e.py --verbose --node-config "${node_config}" validate-node-repave-upgrade + + if [[ "${index}" -lt "${last_index}" ]]; then + python3 ./hack/agent/e2e-kind/e2e.py --verbose reset-agent + python3 ./hack/agent/e2e-kind/e2e.py --verbose delete-machine-cr + fi + echo "::endgroup::" + done - name: Collect VM logs if: always() @@ -320,7 +321,7 @@ jobs: uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 if: always() with: - name: agent-config-e2e-${{ matrix.node_config.name }}-logs + name: agent-config-e2e-logs path: logs/ retention-days: 30 From 87d9312b289df0ddc55c981d26c96deae40000de Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 20 May 2026 18:33:28 +0000 Subject: [PATCH 06/14] Parallelize agent node config e2e scenarios Agent-Logs-Url: https://github.com/Azure/unbounded/sessions/f57fcc1a-6b19-4828-a063-9085a9e0b487 Co-authored-by: bcho <1975118+bcho@users.noreply.github.com> --- .../agent-e2e-kind-control-plane/action.yaml | 10 + .../actions/agent-e2e-kind-logs/action.yaml | 25 ++ .github/workflows/agent-e2e-kind.yaml | 158 +------- .../app/machine_manual_bootstrap.go | 5 + .../app/machine_manual_bootstrap_test.go | 2 + hack/agent/e2e-kind/e2e.py | 355 ++++++++++++++++-- hack/agent/e2e-kind/node-configs/README.md | 3 +- hack/agent/e2e-kind/node-configs/node-ip.json | 7 + 8 files changed, 375 insertions(+), 190 deletions(-) create mode 100644 .github/actions/agent-e2e-kind-logs/action.yaml create mode 100644 hack/agent/e2e-kind/node-configs/node-ip.json diff --git a/.github/actions/agent-e2e-kind-control-plane/action.yaml b/.github/actions/agent-e2e-kind-control-plane/action.yaml index 664a1b41..1a738bce 100644 --- a/.github/actions/agent-e2e-kind-control-plane/action.yaml +++ b/.github/actions/agent-e2e-kind-control-plane/action.yaml @@ -7,6 +7,10 @@ inputs: vm-subnet: description: VM subnet prefix. required: true + create-vm: + description: Whether to launch the default e2e VM. + required: false + default: "true" runs: using: composite steps: @@ -81,9 +85,15 @@ runs: kubectl -n kube-system rollout status daemonset/kindnet --timeout=60s - name: Create QEMU VM + if: ${{ inputs['create-vm'] == 'true' }} shell: bash run: python3 ./hack/agent/e2e-kind/e2e.py --verbose create-vm + - name: Create VM bridge + if: ${{ inputs['create-vm'] != 'true' }} + shell: bash + run: python3 ./hack/agent/e2e-kind/e2e.py --verbose create-vm-bridge + - name: Attach Kind container to VM bridge shell: bash env: diff --git a/.github/actions/agent-e2e-kind-logs/action.yaml b/.github/actions/agent-e2e-kind-logs/action.yaml new file mode 100644 index 00000000..f685c77b --- /dev/null +++ b/.github/actions/agent-e2e-kind-logs/action.yaml @@ -0,0 +1,25 @@ +name: Collect agent e2e Kind logs +description: Collect and upload diagnostics for agent e2e Kind jobs. +inputs: + artifact-name: + description: Name of the uploaded log artifact. + required: true + node-configs: + description: Whether to collect logs for discovered node config scenarios. + required: false + default: "false" +runs: + using: composite + steps: + - name: Collect logs + shell: bash + env: + COLLECT_NODE_CONFIG_LOGS: ${{ inputs['node-configs'] }} + run: python3 ./hack/agent/e2e-kind/e2e.py --verbose collect-logs + + - name: Upload logs + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: ${{ inputs['artifact-name'] }} + path: logs/ + retention-days: 30 diff --git a/.github/workflows/agent-e2e-kind.yaml b/.github/workflows/agent-e2e-kind.yaml index 89b66fbe..c6c22fa7 100644 --- a/.github/workflows/agent-e2e-kind.yaml +++ b/.github/workflows/agent-e2e-kind.yaml @@ -137,77 +137,11 @@ jobs: - name: Validate node repave upgrade run: python3 ./hack/agent/e2e-kind/e2e.py --verbose validate-node-repave-upgrade - - name: Collect VM logs - if: always() - run: | - mkdir -p logs - VM_DIR=".vm-e2e" - # Collect VM serial console log - cp "${VM_DIR}/${VM_NAME}.log" logs/vm-serial.log 2>/dev/null || true - SSH="ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 -i ${VM_DIR}/ssh/id_ed25519 ubuntu@${VM_IP}" - # Collect full journal from the VM host (best-effort) - $SSH "sudo journalctl --no-pager -l" > logs/vm-journal.log 2>/dev/null || true - # Collect unbounded-agent logs from the VM host - $SSH "sudo journalctl -u unbounded-agent --no-pager -l" > logs/vm-unbounded-agent.log 2>/dev/null || true - # Collect unbounded-agent-daemon logs - $SSH "sudo journalctl -u unbounded-agent-daemon --no-pager -l" > logs/vm-unbounded-agent-daemon.log 2>/dev/null || true - cp ".vm-e2e/machina-controller.log" logs/machina-controller.log 2>/dev/null || true - # Kubelet and containerd run inside the nspawn container, not on - # the host. Use 'journalctl -M ' to read the container - # journal, and fall back to machinectl shell if that doesn't work. - # The nspawn machine name is fixed to kube1/kube2 (decoupled from - # the Kubernetes node name). - $SSH "sudo machinectl list --no-pager" > logs/vm-machines.txt 2>/dev/null || true - for MACHINE in kube1 kube2; do - $SSH "sudo journalctl -M ${MACHINE} --no-pager -l" > logs/nspawn-${MACHINE}-journal.log 2>/dev/null || true - $SSH "sudo journalctl -M ${MACHINE} -u kubelet --no-pager -l" > logs/nspawn-${MACHINE}-kubelet.log 2>/dev/null || true - $SSH "sudo journalctl -M ${MACHINE} -u containerd --no-pager -l" > logs/nspawn-${MACHINE}-containerd.log 2>/dev/null || true - $SSH "sudo machinectl status ${MACHINE} --no-pager" > logs/vm-machine-${MACHINE}-status.txt 2>/dev/null || true - $SSH "sudo machinectl shell ${MACHINE} /usr/bin/systemctl list-units --no-pager" > logs/nspawn-${MACHINE}-units.txt 2>/dev/null || true - done - - - name: Collect cluster state - if: always() - run: | - mkdir -p logs - kubectl get nodes -o wide > logs/nodes.txt 2>&1 || true - kubectl describe nodes > logs/nodes-describe.txt 2>&1 || true - kubectl get pods -A -o wide > logs/pods.txt 2>&1 || true - kubectl get events -A --sort-by='.lastTimestamp' > logs/events.txt 2>&1 || true - # Collect Machine CRs (if CRD is installed) - kubectl get machines -o wide > logs/machines.txt 2>&1 || true - kubectl get machines -o yaml > logs/machines-full.yaml 2>&1 || true - kubectl get machineconfigurations -o wide > logs/machineconfigurations.txt 2>&1 || true - kubectl get machineconfigurations -o yaml > logs/machineconfigurations-full.yaml 2>&1 || true - kubectl get machineconfigurationversions -o wide > logs/machineconfigurationversions.txt 2>&1 || true - kubectl get machineconfigurationversions -o yaml > logs/machineconfigurationversions-full.yaml 2>&1 || true - kubectl get machineoperations -o wide > logs/machineoperations.txt 2>&1 || true - kubectl get machineoperations -o yaml > logs/machineoperations-full.yaml 2>&1 || true - # Collect kubelet logs from the Kind control-plane - docker exec kind-control-plane journalctl -u kubelet --no-pager -l > logs/kind-kubelet.log 2>&1 || true - # Collect kube-apiserver logs (critical for diagnosing TLS bootstrap / RBAC issues) - docker exec kind-control-plane crictl logs $(docker exec kind-control-plane crictl ps -a --name kube-apiserver -q 2>/dev/null | head -1) > logs/kube-apiserver.log 2>&1 || true - # Dump all ClusterRoleBindings to see what RBAC kubeadm/Kind created - kubectl get clusterrolebindings -o wide > logs/clusterrolebindings.txt 2>&1 || true - kubectl get clusterrolebindings -o yaml > logs/clusterrolebindings-full.yaml 2>&1 || true - # List CSRs to see if the kubelet attempted TLS bootstrap - kubectl get csr -o wide > logs/csrs.txt 2>&1 || true - kubectl describe csr > logs/csrs-describe.txt 2>&1 || true - # Dump bootstrap token secrets (redact token-secret values) - kubectl get secrets -n kube-system -l 'kubernetes.io/legacy-token-last-used' -o wide > logs/bootstrap-tokens.txt 2>&1 || true - kubectl get secrets -n kube-system --field-selector type=bootstrap.kubernetes.io/token -o yaml > logs/bootstrap-token-secrets.yaml 2>&1 || true - # Collect workload test pod details - kubectl describe pods -n e2e-workload-test > logs/workload-pods-describe.txt 2>&1 || true - kubectl logs -n e2e-workload-test --all-containers --prefix e2e-hello > logs/workload-hello.log 2>&1 || true - kubectl logs -n e2e-workload-test --all-containers --prefix e2e-dns-test > logs/workload-dns.log 2>&1 || true - - - name: Upload logs - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + - name: Collect logs if: always() + uses: ./.github/actions/agent-e2e-kind-logs with: - name: agent-e2e-kind-logs - path: logs/ - retention-days: 30 + artifact-name: agent-e2e-kind-logs - name: Cleanup if: always() @@ -232,6 +166,7 @@ jobs: with: cluster-name: ${{ env.KIND_CLUSTER_NAME }} vm-subnet: ${{ env.VM_SUBNET }} + create-vm: "false" - name: Install Machine CRD run: python3 ./hack/agent/e2e-kind/e2e.py --verbose install-machine-crd @@ -243,87 +178,14 @@ jobs: run: python3 ./hack/agent/e2e-kind/e2e.py --verbose validate-machina-controller - name: Discover and validate node configs - run: | - set -euo pipefail - mapfile -t NODE_CONFIGS < <(find hack/agent/e2e-kind/node-configs \ - -maxdepth 1 -type f -name '*.json' | sort) - if [[ "${#NODE_CONFIGS[@]}" -eq 0 ]]; then - echo "::error::No node config scenarios found" - exit 1 - fi - - last_index=$((${#NODE_CONFIGS[@]} - 1)) - for index in "${!NODE_CONFIGS[@]}"; do - node_config="${NODE_CONFIGS[$index]}" - scenario="$(basename "${node_config}" .json)" - echo "::group::agent config e2e: ${scenario}" - python3 ./hack/agent/e2e-kind/e2e.py --verbose --node-config "${node_config}" run-agent - python3 ./hack/agent/e2e-kind/e2e.py --verbose wait-for-node - python3 ./hack/agent/e2e-kind/e2e.py --verbose --node-config "${node_config}" validate-node-config - python3 ./hack/agent/e2e-kind/e2e.py --verbose dump-persisted-agent-config - python3 ./hack/agent/e2e-kind/e2e.py --verbose validate-kube-proxy - python3 ./hack/agent/e2e-kind/e2e.py --verbose --node-config "${node_config}" validate-machine-cr-created - python3 ./hack/agent/e2e-kind/e2e.py --verbose validate-workload - python3 ./hack/agent/e2e-kind/e2e.py --verbose --node-config "${node_config}" validate-node-repave-upgrade - - if [[ "${index}" -lt "${last_index}" ]]; then - python3 ./hack/agent/e2e-kind/e2e.py --verbose reset-agent - python3 ./hack/agent/e2e-kind/e2e.py --verbose delete-machine-cr - fi - echo "::endgroup::" - done - - - name: Collect VM logs - if: always() - run: | - mkdir -p logs - VM_DIR=".vm-e2e" - cp "${VM_DIR}/${VM_NAME}.log" logs/vm-serial.log 2>/dev/null || true - SSH="ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 -i ${VM_DIR}/ssh/id_ed25519 ubuntu@${VM_IP}" - $SSH "sudo journalctl --no-pager -l" > logs/vm-journal.log 2>/dev/null || true - $SSH "sudo journalctl -u unbounded-agent --no-pager -l" > logs/vm-unbounded-agent.log 2>/dev/null || true - $SSH "sudo journalctl -u unbounded-agent-daemon --no-pager -l" > logs/vm-unbounded-agent-daemon.log 2>/dev/null || true - cp ".vm-e2e/machina-controller.log" logs/machina-controller.log 2>/dev/null || true - $SSH "sudo machinectl list --no-pager" > logs/vm-machines.txt 2>/dev/null || true - for MACHINE in kube1 kube2; do - $SSH "sudo journalctl -M ${MACHINE} --no-pager -l" > logs/nspawn-${MACHINE}-journal.log 2>/dev/null || true - $SSH "sudo journalctl -M ${MACHINE} -u kubelet --no-pager -l" > logs/nspawn-${MACHINE}-kubelet.log 2>/dev/null || true - $SSH "sudo journalctl -M ${MACHINE} -u containerd --no-pager -l" > logs/nspawn-${MACHINE}-containerd.log 2>/dev/null || true - $SSH "sudo machinectl status ${MACHINE} --no-pager" > logs/vm-machine-${MACHINE}-status.txt 2>/dev/null || true - done - - - name: Collect cluster state - if: always() - run: | - mkdir -p logs - KIND_CONTAINER="${KIND_CLUSTER_NAME}-control-plane" - kubectl get nodes -o wide > logs/nodes.txt 2>&1 || true - kubectl describe nodes > logs/nodes-describe.txt 2>&1 || true - kubectl get pods -A -o wide > logs/pods.txt 2>&1 || true - kubectl get events -A --sort-by='.lastTimestamp' > logs/events.txt 2>&1 || true - kubectl get machines -o wide > logs/machines.txt 2>&1 || true - kubectl get machines -o yaml > logs/machines-full.yaml 2>&1 || true - kubectl get machineconfigurations -o wide > logs/machineconfigurations.txt 2>&1 || true - kubectl get machineconfigurations -o yaml > logs/machineconfigurations-full.yaml 2>&1 || true - kubectl get machineconfigurationversions -o wide > logs/machineconfigurationversions.txt 2>&1 || true - kubectl get machineconfigurationversions -o yaml > logs/machineconfigurationversions-full.yaml 2>&1 || true - kubectl get machineoperations -o wide > logs/machineoperations.txt 2>&1 || true - kubectl get machineoperations -o yaml > logs/machineoperations-full.yaml 2>&1 || true - docker exec "${KIND_CONTAINER}" journalctl -u kubelet --no-pager -l > logs/kind-kubelet.log 2>&1 || true - docker exec "${KIND_CONTAINER}" crictl logs $(docker exec "${KIND_CONTAINER}" crictl ps -a --name kube-apiserver -q 2>/dev/null | head -1) > logs/kube-apiserver.log 2>&1 || true - kubectl get csr -o wide > logs/csrs.txt 2>&1 || true - kubectl describe csr > logs/csrs-describe.txt 2>&1 || true - kubectl describe pods -n e2e-workload-test > logs/workload-pods-describe.txt 2>&1 || true - kubectl logs -n e2e-workload-test --all-containers --prefix e2e-hello > logs/workload-hello.log 2>&1 || true - kubectl logs -n e2e-workload-test --all-containers --prefix e2e-dns-test > logs/workload-dns.log 2>&1 || true - - - name: Upload logs - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + run: python3 ./hack/agent/e2e-kind/e2e.py --verbose validate-node-configs + + - name: Collect logs if: always() + uses: ./.github/actions/agent-e2e-kind-logs with: - name: agent-config-e2e-logs - path: logs/ - retention-days: 30 + artifact-name: agent-config-e2e-logs + node-configs: "true" - name: Cleanup if: always() diff --git a/cmd/kubectl-unbounded/app/machine_manual_bootstrap.go b/cmd/kubectl-unbounded/app/machine_manual_bootstrap.go index f5eb1da9..a2151df7 100644 --- a/cmd/kubectl-unbounded/app/machine_manual_bootstrap.go +++ b/cmd/kubectl-unbounded/app/machine_manual_bootstrap.go @@ -73,6 +73,9 @@ type manualBootstrapHandler struct { // taints are taint strings passed through to kubelet --register-with-taints. taints []string + // nodeIP is passed through to kubelet --node-ip. + nodeIP string + // ociImage is an optional OCI image reference for the agent. When set, // it is included in the AgentConfig JSON so the agent uses a container // image to bootstrap the machine rootfs instead of debootstrap. @@ -301,6 +304,7 @@ func (h *manualBootstrapHandler) buildAgentConfig(ctx context.Context) (*provisi ProviderLabels: providerLabels, BootstrapToken: bootstrapToken, }) + cfg.Kubelet.NodeIP = strings.TrimSpace(h.nodeIP) return &cfg, nil } @@ -467,6 +471,7 @@ Examples: cmd.Flags().StringVar(&handler.kubeconfigPath, "kubeconfig", "", "Path to kubeconfig file") cmd.Flags().StringArrayVar(&handler.nodeLabels, "node-label", nil, "Label in key=value format to pass to kubelet (can be repeated)") cmd.Flags().StringArrayVar(&handler.taints, "register-with-taint", nil, "Taint to register on the node (can be repeated)") + cmd.Flags().StringVar(&handler.nodeIP, "node-ip", "", "IP address to pass to kubelet") cmd.Flags().StringVar(&handler.ociImage, "oci-image", "", "OCI image reference for the agent rootfs") cmd.Flags().StringVar(&handler.kubernetesVersion, "kubernetes-version", "", "Override the Kubernetes version (default: auto-detected from API server)") cmd.Flags().StringVar(&handler.variant, "variant", "script", "Output format: script or cloud-init") diff --git a/cmd/kubectl-unbounded/app/machine_manual_bootstrap_test.go b/cmd/kubectl-unbounded/app/machine_manual_bootstrap_test.go index 6a633464..1fe8b792 100644 --- a/cmd/kubectl-unbounded/app/machine_manual_bootstrap_test.go +++ b/cmd/kubectl-unbounded/app/machine_manual_bootstrap_test.go @@ -175,6 +175,7 @@ func TestManualBootstrapHandler_BuildAgentConfig(t *testing.T) { machineName: "my-node", nodeLabels: []string{"env=prod"}, taints: []string{"dedicated=gpu:NoSchedule"}, + nodeIP: " 10.0.0.15 ", ociImage: "ghcr.io/azure/rootfs:v1", kubeCli: kubeCli, kubeConfig: &rest.Config{Host: "https://my-api-server:6443"}, @@ -190,6 +191,7 @@ func TestManualBootstrapHandler_BuildAgentConfig(t *testing.T) { require.NotEmpty(t, cfg.Cluster.CaCertBase64) require.NotEmpty(t, cfg.Cluster.Version) // fake client returns empty string but it's still set require.Contains(t, cfg.Kubelet.Auth.BootstrapToken, "abc123.") + require.Equal(t, "10.0.0.15", cfg.Kubelet.NodeIP) require.Equal(t, map[string]string{"env": "prod"}, cfg.Kubelet.Labels) require.Equal(t, []string{"dedicated=gpu:NoSchedule"}, cfg.Kubelet.RegisterWithTaints) require.Equal(t, "ghcr.io/azure/rootfs:v1", cfg.OCIImage) diff --git a/hack/agent/e2e-kind/e2e.py b/hack/agent/e2e-kind/e2e.py index 1b631a3f..9c855849 100755 --- a/hack/agent/e2e-kind/e2e.py +++ b/hack/agent/e2e-kind/e2e.py @@ -38,6 +38,8 @@ validate-agent-upgrade-operation Verify AgentUpgrade switches the host daemon binary. validate-agent-upgrade-rollback Verify AgentUpgrade rollback restores last-known-good. validate-node-repave-upgrade Verify OnDelete repave applies a new MCV Kubernetes version. + validate-node-configs Discover and validate node config scenarios in parallel. + collect-logs Collect VM and cluster diagnostic logs. reset-agent Trigger AgentReset and verify cleanup. cleanup Tear down VM, networking, and Kind cluster. """ @@ -46,8 +48,10 @@ import argparse import base64 +import concurrent.futures import json import os +import re import secrets import shutil import subprocess @@ -72,6 +76,7 @@ VM_IP = os.environ.get("VM_IP", f"{VM_SUBNET}.10") VM_GATEWAY = f"{VM_SUBNET}.1" VM_DIR = Path(os.environ.get("VM_DIR", str(REPO_ROOT / ".vm-e2e"))) +NODE_CONFIG_DIR = REPO_ROOT / "hack" / "agent" / "e2e-kind" / "node-configs" KIND_CLUSTER_NAME = os.environ.get("KIND_CLUSTER_NAME", "kind") KIND_CONTAINER = f"{KIND_CLUSTER_NAME}-control-plane" @@ -85,7 +90,7 @@ NSPAWN_MACHINE_NAMES = ["kube1", "kube2"] BRIDGE_NAME = "virbr-e2e" -TAP_NAME = "tap-e2e" +TAP_NAME = os.environ.get("TAP_NAME", "tap-e2e") SERVE_PORT = 8199 AGENT_UPGRADE_ROLLBACK_MESSAGE_FRAGMENT = "rolled back" @@ -205,6 +210,7 @@ def load_node_config(path: str | None) -> dict[str, Any]: "name": "default", "nodeLabels": {}, "registerWithTaints": [], + "nodeIP": "", } config_path = Path(path) @@ -222,6 +228,7 @@ def load_node_config(path: str | None) -> dict[str, Any]: name = cfg.get("name", config_path.stem) node_labels = cfg.get("nodeLabels", {}) register_with_taints = cfg.get("registerWithTaints", []) + node_ip = cfg.get("nodeIP", "") if not isinstance(name, str) or not name: die(f"node config {config_path} field 'name' must be a non-empty string") @@ -234,11 +241,14 @@ def load_node_config(path: str | None) -> dict[str, Any]: isinstance(taint, str) for taint in register_with_taints ): die(f"node config {config_path} field 'registerWithTaints' must be a list of strings") + if not isinstance(node_ip, str): + die(f"node config {config_path} field 'nodeIP' must be a string") return { "name": name, "nodeLabels": dict(node_labels), "registerWithTaints": list(register_with_taints), + "nodeIP": node_ip, } @@ -252,6 +262,14 @@ def expected_node_taint_strings(node_config: dict[str, Any]) -> list[str]: return list(node_config["registerWithTaints"]) +def expected_node_ip(node_config: dict[str, Any]) -> str: + """Return the expected Node InternalIP for this e2e node variant.""" + node_ip = node_config.get("nodeIP", "") + if node_ip in ("$VM_IP", "${VM_IP}"): + return VM_IP + return node_ip or VM_IP + + def expected_node_taints(node_config: dict[str, Any]) -> list[dict[str, str]]: """Return taints configured for this e2e node variant.""" taints: list[dict[str, str]] = [] @@ -272,6 +290,9 @@ def expected_node_taints(node_config: dict[str, Any]) -> list[dict[str, str]]: def node_config_bootstrap_args(node_config: dict[str, Any]) -> list[str]: """Return manual-bootstrap flags for the active node config variant.""" args: list[str] = [] + node_ip = node_config.get("nodeIP", "") + if node_ip: + args.extend(["--node-ip", expected_node_ip(node_config)]) for key, value in sorted(expected_node_labels(node_config).items()): args.extend(["--node-label", f"{key}={value}"]) for taint in expected_node_taint_strings(node_config): @@ -283,11 +304,44 @@ def log_active_node_config(node_config: dict[str, Any]) -> None: """Log the active e2e node config variant.""" labels = [f"{key}={value}" for key, value in sorted(expected_node_labels(node_config).items())] taints = expected_node_taint_strings(node_config) + node_ip = node_config.get("nodeIP", "") log(f"Agent e2e node config variant: {node_config['name']}") + log(f" node ip: {expected_node_ip(node_config) if node_ip else ''}") log(f" node labels: {', '.join(labels) if labels else ''}") log(f" register-with-taints: {', '.join(taints) if taints else ''}") +def _safe_name(value: str) -> str: + """Return a DNS-label-safe name fragment for VM and node names.""" + safe = re.sub(r"[^a-z0-9-]+", "-", value.lower()).strip("-") + return safe or "config" + + +def discover_node_configs() -> list[dict[str, Any]]: + """Load all node config scenario files in deterministic order.""" + configs: list[dict[str, Any]] = [] + for path in sorted(NODE_CONFIG_DIR.glob("*.json")): + cfg = load_node_config(str(path)) + cfg["_path"] = str(path) + configs.append(cfg) + if not configs: + die(f"No node config scenarios found in {NODE_CONFIG_DIR}") + return configs + + +def scenario_env(node_config: dict[str, Any], index: int) -> dict[str, str]: + """Return per-scenario environment overrides for a parallel e2e node.""" + name = _safe_name(node_config["name"]) + vm_name = f"{VM_NAME}-{name}" + return { + "VM_NAME": vm_name, + "AGENT_MACHINE_NAME": vm_name, + "VM_IP": f"{VM_SUBNET}.{10 + index}", + "VM_DIR": str(VM_DIR / name), + "TAP_NAME": f"tap-e2e-{index}", + } + + def _machine_operation_resource() -> str: """Return the fully-qualified MachineOperation resource name.""" return "machineoperations.v1alpha3.unbounded-cloud.io" @@ -693,16 +747,14 @@ def _build_script_agent_tarball(tarball: Path, build_name: str, script: str) -> # --------------------------------------------------------------------------- # create-vm / recreate-vm helpers # --------------------------------------------------------------------------- -def _stop_qemu() -> None: - """Stop the QEMU VM process if it is running.""" - pid_file = VM_DIR / f"{VM_NAME}.pid" +def _stop_qemu_by_pid_file(pid_file: Path, vm_name: str) -> None: if not pid_file.exists(): return pid = int(pid_file.read_text().strip()) try: os.kill(pid, 0) - log(f"Stopping VM '{VM_NAME}' (PID: {pid})...") + log(f"Stopping VM '{vm_name}' (PID: {pid})...") os.kill(pid, 15) time.sleep(2) try: @@ -716,6 +768,11 @@ def _stop_qemu() -> None: pid_file.unlink(missing_ok=True) +def _stop_qemu() -> None: + """Stop the QEMU VM process if it is running.""" + _stop_qemu_by_pid_file(VM_DIR / f"{VM_NAME}.pid", VM_NAME) + + def _launch_vm(ssh_pub_key: str) -> None: """Create a fresh VM disk, cloud-init ISO, launch QEMU, and wait for SSH. @@ -875,9 +932,7 @@ def _launch_vm(ssh_pub_key: str) -> None: # --------------------------------------------------------------------------- # create-vm # --------------------------------------------------------------------------- -def create_vm() -> None: - """Create bridge networking and launch a QEMU VM.""" - +def _check_vm_prereqs() -> None: # Pre-flight for cmd in ("qemu-system-x86_64", "qemu-img", "genisoimage"): if shutil.which(cmd) is None: @@ -885,6 +940,8 @@ def create_vm() -> None: if not os.access("/dev/kvm", os.R_OK): die("/dev/kvm is not accessible. Enable KVM for hardware acceleration.") + +def _ensure_vm_ssh_key() -> str: VM_DIR.mkdir(parents=True, exist_ok=True) SSH_KEY_DIR.mkdir(parents=True, exist_ok=True) @@ -893,9 +950,11 @@ def create_vm() -> None: log("Generating SSH key pair...") run(["ssh-keygen", "-t", "ed25519", "-f", str(SSH_KEY), "-N", "", "-q"]) - ssh_pub_key = SSH_KEY.with_suffix(".pub").read_text().strip() + return SSH_KEY.with_suffix(".pub").read_text().strip() - # Create bridge network + +def create_vm_bridge() -> None: + """Create bridge networking shared by e2e VMs.""" log(f"Creating bridge network {BRIDGE_NAME}...") run_quiet(["sudo", "ip", "link", "del", BRIDGE_NAME], check=False) run(["sudo", "ip", "link", "add", BRIDGE_NAME, "type", "bridge"]) @@ -913,6 +972,18 @@ def create_vm() -> None: # Prevent NetworkManager from detaching interfaces from the bridge. _nm_unmanage(BRIDGE_NAME) + + +def launch_vm() -> None: + """Launch a QEMU VM on an existing e2e bridge.""" + _check_vm_prereqs() + ssh_pub_key = _ensure_vm_ssh_key() + + # TAP device + run_quiet(["sudo", "ip", "link", "delete", TAP_NAME], check=False) + run(["sudo", "ip", "tuntap", "add", "dev", TAP_NAME, "mode", "tap"]) + run(["sudo", "ip", "link", "set", TAP_NAME, "master", BRIDGE_NAME]) + run(["sudo", "ip", "link", "set", TAP_NAME, "up"]) _nm_unmanage(TAP_NAME) # Download Ubuntu cloud image @@ -927,6 +998,13 @@ def create_vm() -> None: _launch_vm(ssh_pub_key) +def create_vm() -> None: + """Create bridge networking and launch a QEMU VM.""" + _check_vm_prereqs() + create_vm_bridge() + launch_vm() + + # --------------------------------------------------------------------------- # ensure-kind-bridge # --------------------------------------------------------------------------- @@ -1028,6 +1106,32 @@ def run_agent(node_config: dict[str, Any]) -> None: if shutil.which(cmd) is None: die(f"{cmd} is required but not found in PATH") + agent_url_override = os.environ.get("AGENT_URL", "") + if agent_url_override: + _run_agent_inner(agent_url_override, node_config) + log("Agent bootstrap completed") + return + + agent_url = prepare_agent_artifacts() + log(f"Starting HTTP file server on {VM_GATEWAY}:{SERVE_PORT}...") + handler = _make_handler(str(VM_DIR)) + httpd = HTTPServer((VM_GATEWAY, SERVE_PORT), handler) + server_thread = Thread(target=httpd.serve_forever, daemon=True) + server_thread.start() + log(f"Agent download URL: {agent_url}") + + try: + _run_agent_inner(agent_url, node_config) + finally: + httpd.shutdown() + + log("Agent bootstrap completed") + + +def prepare_agent_artifacts() -> str: + """Build agent artifacts and return the URL that serves the tarball.""" + VM_DIR.mkdir(parents=True, exist_ok=True) + # Build agent binary and package as tarball log("Building unbounded-agent...") agent_bin = REPO_ROOT / "bin" / "unbounded-agent" @@ -1052,20 +1156,7 @@ def run_agent(node_config: dict[str, Any]) -> None: # Serve the tarball over HTTP runner_ip = VM_GATEWAY agent_url = f"http://{runner_ip}:{SERVE_PORT}/unbounded-agent-linux-amd64.tar.gz" - - log(f"Starting HTTP file server on {runner_ip}:{SERVE_PORT}...") - handler = _make_handler(str(VM_DIR)) - httpd = HTTPServer((runner_ip, SERVE_PORT), handler) - server_thread = Thread(target=httpd.serve_forever, daemon=True) - server_thread.start() - log(f"Agent download URL: {agent_url}") - - try: - _run_agent_inner(agent_url, node_config) - finally: - httpd.shutdown() - - log("Agent bootstrap completed") + return agent_url def _make_handler(directory: str) -> type: @@ -1262,8 +1353,9 @@ def _assert_expected_node_config(node: dict[str, Any], node_config: dict[str, An for address in node.get("status", {}).get("addresses", []) if address.get("type") == "InternalIP" ] - if VM_IP not in internal_ips: - die(f"node InternalIP mismatch: got {internal_ips}, expected {VM_IP!r}") + node_ip = expected_node_ip(node_config) + if node_ip not in internal_ips: + die(f"node InternalIP mismatch: got {internal_ips}, expected {node_ip!r}") def validate_node_config(node_config: dict[str, Any]) -> None: @@ -1279,6 +1371,74 @@ def validate_node_config(node_config: dict[str, Any]) -> None: kubectl(["get", "node", AGENT_MACHINE_NAME, "-o", "wide"]) +def _run_scenario_command(command: str, node_config: dict[str, Any], env: dict[str, str]) -> None: + args = [sys.executable, str(Path(__file__))] + if VERBOSE: + args.append("--verbose") + if "_path" in node_config: + args.extend(["--node-config", node_config["_path"]]) + args.append(command) + + child_env = {**os.environ, **env} + run(args, env=child_env) + + +def _validate_node_config_scenario(node_config: dict[str, Any], index: int, agent_url: str) -> None: + name = node_config["name"] + env = scenario_env(node_config, index) + env["AGENT_URL"] = agent_url + + log(f"Starting agent config scenario {name!r} on {env['VM_NAME']} ({env['VM_IP']})") + for command in ( + "launch-vm", + "run-agent", + "wait-for-node", + "validate-node-config", + "dump-persisted-agent-config", + "validate-machine-cr-created", + "validate-workload", + "validate-node-repave-upgrade", + ): + _run_scenario_command(command, node_config, env) + log(f"Agent config scenario {name!r} passed") + + +def validate_node_config_scenarios() -> None: + """Discover node config scenarios and validate them in parallel.""" + configs = discover_node_configs() + agent_url = prepare_agent_artifacts() + + log(f"Starting HTTP file server on {VM_GATEWAY}:{SERVE_PORT}...") + handler = _make_handler(str(VM_DIR)) + httpd = HTTPServer((VM_GATEWAY, SERVE_PORT), handler) + server_thread = Thread(target=httpd.serve_forever, daemon=True) + server_thread.start() + log(f"Agent download URL: {agent_url}") + + failures: list[str] = [] + try: + with concurrent.futures.ThreadPoolExecutor(max_workers=len(configs)) as executor: + futures = { + executor.submit(_validate_node_config_scenario, cfg, index, agent_url): cfg["name"] + for index, cfg in enumerate(configs) + } + for future in concurrent.futures.as_completed(futures): + name = futures[future] + try: + future.result() + except subprocess.CalledProcessError as exc: + failures.append(f"{name}: {exc.cmd} exited with {exc.returncode}") + except Exception as exc: + failures.append(f"{name}: {exc}") + finally: + httpd.shutdown() + + if failures: + die("agent config scenario validation failed: " + "; ".join(failures)) + + validate_kube_proxy() + + # --------------------------------------------------------------------------- # dump-persisted-agent-config # --------------------------------------------------------------------------- @@ -1491,6 +1651,9 @@ def validate_workload() -> None: """Deploy test pods on the agent node and verify they run.""" timeout_secs = 300 + pod_suffix = _safe_name(AGENT_MACHINE_NAME) + hello_pod_name = f"e2e-hello-{pod_suffix}" + dns_pod_name = f"e2e-dns-test-{pod_suffix}" # Create test namespace (idempotent) log(f"Creating test namespace '{TEST_NS}'...") @@ -1499,7 +1662,7 @@ def validate_workload() -> None: kubectl(["apply", "-f", "-"], input=ns_yaml.encode()) # Clean up any stale pods from a previous run (e.g. after reset + rejoin) - for pod_name in ("e2e-hello", "e2e-dns-test"): + for pod_name in (hello_pod_name, dns_pod_name): run_quiet([KUBECTL, "delete", "pod", pod_name, "-n", TEST_NS, "--ignore-not-found"], check=False) @@ -1508,7 +1671,7 @@ def validate_workload() -> None: hello_pod = { "apiVersion": "v1", "kind": "Pod", - "metadata": {"name": "e2e-hello", "namespace": TEST_NS, "labels": {"app": "e2e-hello"}}, + "metadata": {"name": hello_pod_name, "namespace": TEST_NS, "labels": {"app": "e2e-hello"}}, "spec": { "nodeName": AGENT_MACHINE_NAME, "containers": [{ @@ -1523,28 +1686,28 @@ def validate_workload() -> None: kubectl(["apply", "-f", "-"], input=json.dumps(hello_pod).encode()) # Wait for Running - log("Waiting for pod 'e2e-hello' to be Running...") + log(f"Waiting for pod '{hello_pod_name}' to be Running...") elapsed = 0 while elapsed < timeout_secs: result = subprocess.run( - [KUBECTL, "get", "pod", "e2e-hello", "-n", TEST_NS, + [KUBECTL, "get", "pod", hello_pod_name, "-n", TEST_NS, "-o", "jsonpath={.status.phase}"], capture_output=True, text=True, ) phase = result.stdout.strip() if result.returncode == 0 else "" if phase == "Running": - log(f"Pod 'e2e-hello' is Running after {elapsed}s") + log(f"Pod '{hello_pod_name}' is Running after {elapsed}s") break if phase in ("Failed", "Unknown"): - subprocess.run([KUBECTL, "describe", "pod", "e2e-hello", "-n", TEST_NS], check=False) - die(f"Pod 'e2e-hello' entered {phase} state") + subprocess.run([KUBECTL, "describe", "pod", hello_pod_name, "-n", TEST_NS], check=False) + die(f"Pod '{hello_pod_name}' entered {phase} state") if elapsed > 0 and elapsed % 30 == 0: log(f" ({elapsed}s) Pod phase: {phase or 'Pending'}") time.sleep(5) elapsed += 5 else: - subprocess.run([KUBECTL, "describe", "pod", "e2e-hello", "-n", TEST_NS], check=False) - die(f"Timed out waiting for pod 'e2e-hello' to be Running after {timeout_secs}s") + subprocess.run([KUBECTL, "describe", "pod", hello_pod_name, "-n", TEST_NS], check=False) + die(f"Timed out waiting for pod '{hello_pod_name}' to be Running after {timeout_secs}s") # Emit network diagnostics before attempting kubectl logs. The API # server proxies log requests through the kubelet (port 10250) on the @@ -1560,7 +1723,7 @@ def validate_workload() -> None: log_attempts = 6 for attempt in range(1, log_attempts + 1): result = subprocess.run( - [KUBECTL, "logs", "e2e-hello", "-n", TEST_NS], + [KUBECTL, "logs", hello_pod_name, "-n", TEST_NS], capture_output=True, text=True, ) if result.returncode == 0: @@ -1571,7 +1734,7 @@ def validate_workload() -> None: time.sleep(5) else: log(f" kubectl logs failed (attempt {attempt}/{log_attempts}): {result.stderr.strip()}") - subprocess.run([KUBECTL, "describe", "pod", "e2e-hello", "-n", TEST_NS], check=False) + subprocess.run([KUBECTL, "describe", "pod", hello_pod_name, "-n", TEST_NS], check=False) die(f"kubectl logs failed after {log_attempts} attempts") print(logs, flush=True) @@ -1580,7 +1743,7 @@ def validate_workload() -> None: log("Pod logs contain expected message") # Verify node placement - pod_node = kubectl_capture(["get", "pod", "e2e-hello", "-n", TEST_NS, + pod_node = kubectl_capture(["get", "pod", hello_pod_name, "-n", TEST_NS, "-o", "jsonpath={.spec.nodeName}"]) if pod_node != AGENT_MACHINE_NAME: die(f"Pod is running on '{pod_node}' instead of '{AGENT_MACHINE_NAME}'") @@ -1591,7 +1754,7 @@ def validate_workload() -> None: dns_pod = { "apiVersion": "v1", "kind": "Pod", - "metadata": {"name": "e2e-dns-test", "namespace": TEST_NS}, + "metadata": {"name": dns_pod_name, "namespace": TEST_NS, "labels": {"app": "e2e-dns-test"}}, "spec": { "nodeName": AGENT_MACHINE_NAME, "containers": [{ @@ -1611,7 +1774,7 @@ def validate_workload() -> None: elapsed = 0 while elapsed < timeout_secs: result = subprocess.run( - [KUBECTL, "get", "pod", "e2e-dns-test", "-n", TEST_NS, + [KUBECTL, "get", "pod", dns_pod_name, "-n", TEST_NS, "-o", "jsonpath={.status.phase}"], capture_output=True, text=True, ) @@ -1629,7 +1792,7 @@ def validate_workload() -> None: elapsed += 5 dns_result = subprocess.run( - [KUBECTL, "logs", "e2e-dns-test", "-n", TEST_NS], + [KUBECTL, "logs", dns_pod_name, "-n", TEST_NS], capture_output=True, text=True, ) dns_logs = dns_result.stdout.strip() if dns_result.returncode == 0 else "" @@ -2228,6 +2391,105 @@ def validate_node_repave_upgrade(node_config: dict[str, Any]) -> None: kubectl(["get", "node", AGENT_MACHINE_NAME, "-o", "wide"]) +# --------------------------------------------------------------------------- +# collect-logs +# --------------------------------------------------------------------------- +def _write_command_log(path: Path, args: list[str]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w") as out: + subprocess.run(args, stdout=out, stderr=subprocess.STDOUT, check=False) + + +def _collect_one_vm_logs(logs_dir: Path, vm_name: str, vm_ip: str, vm_dir: Path, prefix: str) -> None: + serial_log = vm_dir / f"{vm_name}.log" + if serial_log.exists(): + shutil.copyfile(serial_log, logs_dir / f"{prefix}vm-serial.log") + + ssh_opts = [ + "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", + "-o", "ConnectTimeout=5", + "-i", str(vm_dir / "ssh" / "id_ed25519"), + ] + ssh_target = f"ubuntu@{vm_ip}" + + def ssh_log(name: str, command: str) -> None: + _write_command_log(logs_dir / f"{prefix}{name}", ["ssh", *ssh_opts, ssh_target, command]) + + ssh_log("vm-journal.log", "sudo journalctl --no-pager -l") + ssh_log("vm-unbounded-agent.log", "sudo journalctl -u unbounded-agent --no-pager -l") + ssh_log("vm-unbounded-agent-daemon.log", "sudo journalctl -u unbounded-agent-daemon --no-pager -l") + ssh_log("vm-machines.txt", "sudo machinectl list --no-pager") + for machine in NSPAWN_MACHINE_NAMES: + ssh_log(f"nspawn-{machine}-journal.log", f"sudo journalctl -M {machine} --no-pager -l") + ssh_log(f"nspawn-{machine}-kubelet.log", f"sudo journalctl -M {machine} -u kubelet --no-pager -l") + ssh_log(f"nspawn-{machine}-containerd.log", f"sudo journalctl -M {machine} -u containerd --no-pager -l") + ssh_log(f"vm-machine-{machine}-status.txt", f"sudo machinectl status {machine} --no-pager") + ssh_log( + f"nspawn-{machine}-units.txt", + f"sudo machinectl shell {machine} /usr/bin/systemctl list-units --no-pager", + ) + + +def collect_logs() -> None: + """Collect VM and cluster diagnostics into the logs directory.""" + logs_dir = REPO_ROOT / "logs" + logs_dir.mkdir(parents=True, exist_ok=True) + + if os.environ.get("COLLECT_NODE_CONFIG_LOGS", "").lower() == "true": + for index, cfg in enumerate(discover_node_configs()): + env = scenario_env(cfg, index) + prefix = f"{_safe_name(cfg['name'])}-" + _collect_one_vm_logs( + logs_dir, + env["VM_NAME"], + env["VM_IP"], + Path(env["VM_DIR"]), + prefix, + ) + else: + _collect_one_vm_logs(logs_dir, VM_NAME, VM_IP, VM_DIR, "") + + if MACHINA_LOG_FILE.exists(): + shutil.copyfile(MACHINA_LOG_FILE, logs_dir / "machina-controller.log") + + _write_command_log(logs_dir / "nodes.txt", [KUBECTL, "get", "nodes", "-o", "wide"]) + _write_command_log(logs_dir / "nodes-describe.txt", [KUBECTL, "describe", "nodes"]) + _write_command_log(logs_dir / "pods.txt", [KUBECTL, "get", "pods", "-A", "-o", "wide"]) + _write_command_log(logs_dir / "events.txt", [KUBECTL, "get", "events", "-A", "--sort-by=.lastTimestamp"]) + _write_command_log(logs_dir / "machines.txt", [KUBECTL, "get", "machines", "-o", "wide"]) + _write_command_log(logs_dir / "machines-full.yaml", [KUBECTL, "get", "machines", "-o", "yaml"]) + _write_command_log(logs_dir / "machineconfigurations.txt", [KUBECTL, "get", "machineconfigurations", "-o", "wide"]) + _write_command_log(logs_dir / "machineconfigurations-full.yaml", [KUBECTL, "get", "machineconfigurations", "-o", "yaml"]) + _write_command_log(logs_dir / "machineconfigurationversions.txt", [KUBECTL, "get", "machineconfigurationversions", "-o", "wide"]) + _write_command_log(logs_dir / "machineconfigurationversions-full.yaml", [KUBECTL, "get", "machineconfigurationversions", "-o", "yaml"]) + _write_command_log(logs_dir / "machineoperations.txt", [KUBECTL, "get", "machineoperations", "-o", "wide"]) + _write_command_log(logs_dir / "machineoperations-full.yaml", [KUBECTL, "get", "machineoperations", "-o", "yaml"]) + _write_command_log(logs_dir / "kind-kubelet.log", ["docker", "exec", KIND_CONTAINER, "journalctl", "-u", "kubelet", "--no-pager", "-l"]) + kube_apiserver = subprocess.run( + ["docker", "exec", KIND_CONTAINER, "crictl", "ps", "-a", "--name", "kube-apiserver", "-q"], + capture_output=True, text=True, check=False, + ) + apiserver_id = kube_apiserver.stdout.splitlines()[0] if kube_apiserver.stdout.splitlines() else "" + if apiserver_id: + _write_command_log(logs_dir / "kube-apiserver.log", ["docker", "exec", KIND_CONTAINER, "crictl", "logs", apiserver_id]) + _write_command_log(logs_dir / "clusterrolebindings.txt", [KUBECTL, "get", "clusterrolebindings", "-o", "wide"]) + _write_command_log(logs_dir / "clusterrolebindings-full.yaml", [KUBECTL, "get", "clusterrolebindings", "-o", "yaml"]) + _write_command_log(logs_dir / "csrs.txt", [KUBECTL, "get", "csr", "-o", "wide"]) + _write_command_log(logs_dir / "csrs-describe.txt", [KUBECTL, "describe", "csr"]) + _write_command_log( + logs_dir / "bootstrap-tokens.txt", + [KUBECTL, "get", "secrets", "-n", "kube-system", "-l", "kubernetes.io/legacy-token-last-used", "-o", "wide"], + ) + _write_command_log( + logs_dir / "bootstrap-token-secrets.yaml", + [KUBECTL, "get", "secrets", "-n", "kube-system", "--field-selector", "type=bootstrap.kubernetes.io/token", "-o", "yaml"], + ) + _write_command_log(logs_dir / "workload-pods-describe.txt", [KUBECTL, "describe", "pods", "-n", TEST_NS]) + _write_command_log(logs_dir / "workload-hello.log", [KUBECTL, "logs", "-n", TEST_NS, "--all-containers", "--prefix", "-l", "app=e2e-hello"]) + _write_command_log(logs_dir / "workload-dns.log", [KUBECTL, "logs", "-n", TEST_NS, "--all-containers", "--prefix", "-l", "app=e2e-dns-test"]) + + # --------------------------------------------------------------------------- # cleanup # --------------------------------------------------------------------------- @@ -2244,10 +2506,17 @@ def cleanup() -> None: # Stop QEMU VM _stop_qemu() + if os.environ.get("COLLECT_NODE_CONFIG_LOGS", "").lower() == "true" or VM_NAME == "agent-config-e2e": + for index, cfg in enumerate(discover_node_configs()): + env = scenario_env(cfg, index) + _stop_qemu_by_pid_file(Path(env["VM_DIR"]) / f"{env['VM_NAME']}.pid", env["VM_NAME"]) # Remove networking log("Cleaning up networking...") run_quiet(["sudo", "ip", "link", "del", TAP_NAME], check=False) + if VM_NAME == "agent-config-e2e": + for index, _cfg in enumerate(discover_node_configs()): + run_quiet(["sudo", "ip", "link", "del", f"tap-e2e-{index}"], check=False) run_quiet(["sudo", "ip", "link", "del", BRIDGE_NAME], check=False) # Remove iptables/nftables forwarding rules (best-effort). @@ -2309,9 +2578,12 @@ def command(_node_config: dict[str, Any]) -> None: COMMANDS: dict[str, Command] = { + "collect-logs": _without_node_config(collect_logs), + "create-vm-bridge": _without_node_config(create_vm_bridge), "create-vm": _without_node_config(create_vm), "ensure-kind-bridge": _without_node_config(ensure_kind_bridge), "dump-persisted-agent-config": _without_node_config(dump_persisted_agent_config), + "launch-vm": _without_node_config(launch_vm), "run-agent": run_agent, "wait-for-node": _without_node_config(wait_for_node), "validate-node-config": validate_node_config, @@ -2326,6 +2598,7 @@ def command(_node_config: dict[str, Any]) -> None: "validate-agent-upgrade-operation": _without_node_config(validate_agent_upgrade_operation), "validate-agent-upgrade-rollback": _without_node_config(validate_agent_upgrade_rollback), "validate-node-repave-upgrade": validate_node_repave_upgrade, + "validate-node-configs": _without_node_config(validate_node_config_scenarios), "reset-agent": _without_node_config(reset_agent), "cleanup": _without_node_config(cleanup), } diff --git a/hack/agent/e2e-kind/node-configs/README.md b/hack/agent/e2e-kind/node-configs/README.md index 35d1b12c..6f2ee209 100644 --- a/hack/agent/e2e-kind/node-configs/README.md +++ b/hack/agent/e2e-kind/node-configs/README.md @@ -2,4 +2,5 @@ This folder contains different kinds of agent config used by the agent e2e tests. Each file describes one node config scenario that can be passed to `e2e.py` with -`--node-config`. +`--node-config`. Scenarios may configure kubelet settings such as labels, taints, +or the node IP used by the e2e VM. diff --git a/hack/agent/e2e-kind/node-configs/node-ip.json b/hack/agent/e2e-kind/node-configs/node-ip.json new file mode 100644 index 00000000..1483f56a --- /dev/null +++ b/hack/agent/e2e-kind/node-configs/node-ip.json @@ -0,0 +1,7 @@ +{ + "name": "node-ip", + "nodeIP": "$VM_IP", + "nodeLabels": { + "e2e.unbounded-cloud.io/config": "node-ip" + } +} From d1acd3069713016247b72a0ef68f869b861d9422 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 20 May 2026 18:59:43 +0000 Subject: [PATCH 07/14] Extract machina setup and type node configs Agent-Logs-Url: https://github.com/Azure/unbounded/sessions/dfdf4bdc-d5a8-4652-aa3d-f5fe82aa2f3a Co-authored-by: bcho <1975118+bcho@users.noreply.github.com> --- .../agent-e2e-machina-setup/action.yaml | 16 +++ .github/workflows/agent-e2e-kind.yaml | 20 +--- hack/agent/e2e-kind/e2e.py | 100 +++++++++--------- 3 files changed, 71 insertions(+), 65 deletions(-) create mode 100644 .github/actions/agent-e2e-machina-setup/action.yaml diff --git a/.github/actions/agent-e2e-machina-setup/action.yaml b/.github/actions/agent-e2e-machina-setup/action.yaml new file mode 100644 index 00000000..a905aa5e --- /dev/null +++ b/.github/actions/agent-e2e-machina-setup/action.yaml @@ -0,0 +1,16 @@ +name: Set up agent e2e machina resources +description: Install Machine CRDs and start/validate the machina controller for agent e2e tests. +runs: + using: composite + steps: + - name: Install Machine CRD + shell: bash + run: python3 ./hack/agent/e2e-kind/e2e.py --verbose install-machine-crd + + - name: Start machina controller + shell: bash + run: python3 ./hack/agent/e2e-kind/e2e.py --verbose start-machina-controller + + - name: Validate machina controller + shell: bash + run: python3 ./hack/agent/e2e-kind/e2e.py --verbose validate-machina-controller diff --git a/.github/workflows/agent-e2e-kind.yaml b/.github/workflows/agent-e2e-kind.yaml index c6c22fa7..8d2a9cc3 100644 --- a/.github/workflows/agent-e2e-kind.yaml +++ b/.github/workflows/agent-e2e-kind.yaml @@ -66,14 +66,8 @@ jobs: cluster-name: ${{ env.KIND_CLUSTER_NAME }} vm-subnet: ${{ env.VM_SUBNET }} - - name: Install Machine CRD - run: python3 ./hack/agent/e2e-kind/e2e.py --verbose install-machine-crd - - - name: Start machina controller - run: python3 ./hack/agent/e2e-kind/e2e.py --verbose start-machina-controller - - - name: Validate machina controller - run: python3 ./hack/agent/e2e-kind/e2e.py --verbose validate-machina-controller + - name: Set up machina resources + uses: ./.github/actions/agent-e2e-machina-setup # No Machine CR pre-created; agent self-registers during bootstrap. - name: Run agent to join VM to cluster @@ -168,14 +162,8 @@ jobs: vm-subnet: ${{ env.VM_SUBNET }} create-vm: "false" - - name: Install Machine CRD - run: python3 ./hack/agent/e2e-kind/e2e.py --verbose install-machine-crd - - - name: Start machina controller - run: python3 ./hack/agent/e2e-kind/e2e.py --verbose start-machina-controller - - - name: Validate machina controller - run: python3 ./hack/agent/e2e-kind/e2e.py --verbose validate-machina-controller + - name: Set up machina resources + uses: ./.github/actions/agent-e2e-machina-setup - name: Discover and validate node configs run: python3 ./hack/agent/e2e-kind/e2e.py --verbose validate-node-configs diff --git a/hack/agent/e2e-kind/e2e.py b/hack/agent/e2e-kind/e2e.py index 9c855849..8e67312e 100755 --- a/hack/agent/e2e-kind/e2e.py +++ b/hack/agent/e2e-kind/e2e.py @@ -58,6 +58,7 @@ import sys import textwrap import time +from dataclasses import dataclass from http.server import HTTPServer, SimpleHTTPRequestHandler from pathlib import Path from threading import Thread @@ -203,15 +204,19 @@ def _b64(val: str) -> str: return base64.b64encode(val.encode()).decode() -def load_node_config(path: str | None) -> dict[str, Any]: +@dataclass(frozen=True) +class NodeConfig: + name: str + node_labels: dict[str, str] + register_with_taints: list[str] + node_ip: str = "" + path: str = "" + + +def load_node_config(path: str | None) -> NodeConfig: """Load a node config variant from *path*, or return the default config.""" if not path: - return { - "name": "default", - "nodeLabels": {}, - "registerWithTaints": [], - "nodeIP": "", - } + return NodeConfig(name="default", node_labels={}, register_with_taints=[]) config_path = Path(path) if not config_path.is_absolute(): @@ -244,33 +249,34 @@ def load_node_config(path: str | None) -> dict[str, Any]: if not isinstance(node_ip, str): die(f"node config {config_path} field 'nodeIP' must be a string") - return { - "name": name, - "nodeLabels": dict(node_labels), - "registerWithTaints": list(register_with_taints), - "nodeIP": node_ip, - } + return NodeConfig( + name=name, + node_labels=dict(node_labels), + register_with_taints=list(register_with_taints), + node_ip=node_ip, + path=str(config_path), + ) -def expected_node_labels(node_config: dict[str, Any]) -> dict[str, str]: +def expected_node_labels(node_config: NodeConfig) -> dict[str, str]: """Return labels configured for this e2e node variant.""" - return dict(node_config["nodeLabels"]) + return dict(node_config.node_labels) -def expected_node_taint_strings(node_config: dict[str, Any]) -> list[str]: +def expected_node_taint_strings(node_config: NodeConfig) -> list[str]: """Return configured taint strings for this e2e node variant.""" - return list(node_config["registerWithTaints"]) + return list(node_config.register_with_taints) -def expected_node_ip(node_config: dict[str, Any]) -> str: +def expected_node_ip(node_config: NodeConfig) -> str: """Return the expected Node InternalIP for this e2e node variant.""" - node_ip = node_config.get("nodeIP", "") + node_ip = node_config.node_ip if node_ip in ("$VM_IP", "${VM_IP}"): return VM_IP return node_ip or VM_IP -def expected_node_taints(node_config: dict[str, Any]) -> list[dict[str, str]]: +def expected_node_taints(node_config: NodeConfig) -> list[dict[str, str]]: """Return taints configured for this e2e node variant.""" taints: list[dict[str, str]] = [] for item in expected_node_taint_strings(node_config): @@ -287,11 +293,10 @@ def expected_node_taints(node_config: dict[str, Any]) -> list[dict[str, str]]: return taints -def node_config_bootstrap_args(node_config: dict[str, Any]) -> list[str]: +def node_config_bootstrap_args(node_config: NodeConfig) -> list[str]: """Return manual-bootstrap flags for the active node config variant.""" args: list[str] = [] - node_ip = node_config.get("nodeIP", "") - if node_ip: + if node_config.node_ip: args.extend(["--node-ip", expected_node_ip(node_config)]) for key, value in sorted(expected_node_labels(node_config).items()): args.extend(["--node-label", f"{key}={value}"]) @@ -300,13 +305,12 @@ def node_config_bootstrap_args(node_config: dict[str, Any]) -> list[str]: return args -def log_active_node_config(node_config: dict[str, Any]) -> None: +def log_active_node_config(node_config: NodeConfig) -> None: """Log the active e2e node config variant.""" labels = [f"{key}={value}" for key, value in sorted(expected_node_labels(node_config).items())] taints = expected_node_taint_strings(node_config) - node_ip = node_config.get("nodeIP", "") - log(f"Agent e2e node config variant: {node_config['name']}") - log(f" node ip: {expected_node_ip(node_config) if node_ip else ''}") + log(f"Agent e2e node config variant: {node_config.name}") + log(f" node ip: {expected_node_ip(node_config) if node_config.node_ip else ''}") log(f" node labels: {', '.join(labels) if labels else ''}") log(f" register-with-taints: {', '.join(taints) if taints else ''}") @@ -317,21 +321,19 @@ def _safe_name(value: str) -> str: return safe or "config" -def discover_node_configs() -> list[dict[str, Any]]: +def discover_node_configs() -> list[NodeConfig]: """Load all node config scenario files in deterministic order.""" - configs: list[dict[str, Any]] = [] + configs: list[NodeConfig] = [] for path in sorted(NODE_CONFIG_DIR.glob("*.json")): - cfg = load_node_config(str(path)) - cfg["_path"] = str(path) - configs.append(cfg) + configs.append(load_node_config(str(path))) if not configs: die(f"No node config scenarios found in {NODE_CONFIG_DIR}") return configs -def scenario_env(node_config: dict[str, Any], index: int) -> dict[str, str]: +def scenario_env(node_config: NodeConfig, index: int) -> dict[str, str]: """Return per-scenario environment overrides for a parallel e2e node.""" - name = _safe_name(node_config["name"]) + name = _safe_name(node_config.name) vm_name = f"{VM_NAME}-{name}" return { "VM_NAME": vm_name, @@ -1097,7 +1099,7 @@ def ensure_kind_bridge() -> None: # --------------------------------------------------------------------------- # run-agent # --------------------------------------------------------------------------- -def run_agent(node_config: dict[str, Any]) -> None: +def run_agent(node_config: NodeConfig) -> None: """Build agent, generate bootstrap script, and run it on the VM.""" if not SSH_KEY.exists(): @@ -1169,7 +1171,7 @@ def log_message(self, format: str, *args: Any) -> None: # noqa: A002 return Handler -def _run_agent_inner(agent_url: str, node_config: dict[str, Any]) -> None: +def _run_agent_inner(agent_url: str, node_config: NodeConfig) -> None: """Core logic for run-agent (after HTTP server is up).""" # Determine the Kind control-plane IP so connectivity checks have the @@ -1328,7 +1330,7 @@ def wait_for_node() -> None: # --------------------------------------------------------------------------- # validate-node-config # --------------------------------------------------------------------------- -def _assert_expected_node_config(node: dict[str, Any], node_config: dict[str, Any]) -> None: +def _assert_expected_node_config(node: dict[str, Any], node_config: NodeConfig) -> None: expected_labels = expected_node_labels(node_config) expected_taints = expected_node_taints(node_config) @@ -1358,7 +1360,7 @@ def _assert_expected_node_config(node: dict[str, Any], node_config: dict[str, An die(f"node InternalIP mismatch: got {internal_ips}, expected {node_ip!r}") -def validate_node_config(node_config: dict[str, Any]) -> None: +def validate_node_config(node_config: NodeConfig) -> None: """Verify configured node labels and taints are present on the Node.""" log_active_node_config(node_config) @@ -1371,20 +1373,20 @@ def validate_node_config(node_config: dict[str, Any]) -> None: kubectl(["get", "node", AGENT_MACHINE_NAME, "-o", "wide"]) -def _run_scenario_command(command: str, node_config: dict[str, Any], env: dict[str, str]) -> None: +def _run_scenario_command(command: str, node_config: NodeConfig, env: dict[str, str]) -> None: args = [sys.executable, str(Path(__file__))] if VERBOSE: args.append("--verbose") - if "_path" in node_config: - args.extend(["--node-config", node_config["_path"]]) + if node_config.path: + args.extend(["--node-config", node_config.path]) args.append(command) child_env = {**os.environ, **env} run(args, env=child_env) -def _validate_node_config_scenario(node_config: dict[str, Any], index: int, agent_url: str) -> None: - name = node_config["name"] +def _validate_node_config_scenario(node_config: NodeConfig, index: int, agent_url: str) -> None: + name = node_config.name env = scenario_env(node_config, index) env["AGENT_URL"] = agent_url @@ -1419,7 +1421,7 @@ def validate_node_config_scenarios() -> None: try: with concurrent.futures.ThreadPoolExecutor(max_workers=len(configs)) as executor: futures = { - executor.submit(_validate_node_config_scenario, cfg, index, agent_url): cfg["name"] + executor.submit(_validate_node_config_scenario, cfg, index, agent_url): cfg.name for index, cfg in enumerate(configs) } for future in concurrent.futures.as_completed(futures): @@ -2087,7 +2089,7 @@ def delete_machine_cr() -> None: # --------------------------------------------------------------------------- # validate-machine-cr-created # --------------------------------------------------------------------------- -def validate_machine_cr_created(node_config: dict[str, Any]) -> None: +def validate_machine_cr_created(node_config: NodeConfig) -> None: """Validate the agent self-registered a Machine CR during bootstrap. The daemon registers the Machine CR at startup, so this function polls @@ -2295,7 +2297,7 @@ def _next_patch_version(version: str) -> str: return "v" + ".".join(parts) -def validate_node_repave_upgrade(node_config: dict[str, Any]) -> None: +def validate_node_repave_upgrade(node_config: NodeConfig) -> None: """Validate OnDelete repave applies a new MCV Kubernetes version.""" config_name = MACHINE_CONFIG_NAME @@ -2439,7 +2441,7 @@ def collect_logs() -> None: if os.environ.get("COLLECT_NODE_CONFIG_LOGS", "").lower() == "true": for index, cfg in enumerate(discover_node_configs()): env = scenario_env(cfg, index) - prefix = f"{_safe_name(cfg['name'])}-" + prefix = f"{_safe_name(cfg.name)}-" _collect_one_vm_logs( logs_dir, env["VM_NAME"], @@ -2566,12 +2568,12 @@ def cleanup() -> None: # --------------------------------------------------------------------------- # main # --------------------------------------------------------------------------- -Command = Callable[[dict[str, Any]], None] +Command = Callable[[NodeConfig], None] def _without_node_config(func: Callable[[], None]) -> Command: """Adapt a command that does not use node config settings.""" - def command(_node_config: dict[str, Any]) -> None: + def command(_node_config: NodeConfig) -> None: func() return command From 0adc7ecd06bad72ecb8040490d730b2c574e4d99 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 20 May 2026 20:03:15 +0000 Subject: [PATCH 08/14] Ensure machina setup creates e2e state dir Agent-Logs-Url: https://github.com/Azure/unbounded/sessions/b02edbe4-11d3-4395-9282-950750113c50 Co-authored-by: bcho <1975118+bcho@users.noreply.github.com> --- hack/agent/e2e-kind/e2e.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hack/agent/e2e-kind/e2e.py b/hack/agent/e2e-kind/e2e.py index 8e67312e..80d2cf45 100755 --- a/hack/agent/e2e-kind/e2e.py +++ b/hack/agent/e2e-kind/e2e.py @@ -1926,6 +1926,7 @@ def start_machina_controller() -> None: if not api_server: die("Could not determine API server URL from kubeconfig") + VM_DIR.mkdir(parents=True, exist_ok=True) MACHINA_CONFIG_FILE.write_text(textwrap.dedent(f"""\ apiServerEndpoint: {api_server} metricsAddr: "0" From f55274cf145bdb1c97438f4b939ad4ca147fedff Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 21 May 2026 05:03:57 +0000 Subject: [PATCH 09/14] Assign unique MACs to parallel e2e VMs Agent-Logs-Url: https://github.com/Azure/unbounded/sessions/caa37a46-997e-4f3d-96de-d55c1710e5a8 Co-authored-by: bcho <1975118+bcho@users.noreply.github.com> --- hack/agent/e2e-kind/e2e.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/hack/agent/e2e-kind/e2e.py b/hack/agent/e2e-kind/e2e.py index 80d2cf45..bd37ead8 100755 --- a/hack/agent/e2e-kind/e2e.py +++ b/hack/agent/e2e-kind/e2e.py @@ -49,6 +49,7 @@ import argparse import base64 import concurrent.futures +import hashlib import json import os import re @@ -321,6 +322,19 @@ def _safe_name(value: str) -> str: return safe or "config" +def qemu_mac_address() -> str: + """Return a stable, per-VM MAC address for the QEMU tap interface.""" + try: + octets = [int(part) for part in VM_IP.split(".")] + if len(octets) == 4 and all(0 <= part <= 255 for part in octets): + return f"52:54:00:{octets[1]:02x}:{octets[2]:02x}:{octets[3]:02x}" + except ValueError: + pass + + digest = hashlib.sha256(f"{VM_NAME}-{VM_IP}".encode()).digest() + return f"52:54:00:{digest[0]:02x}:{digest[1]:02x}:{digest[2]:02x}" + + def discover_node_configs() -> list[NodeConfig]: """Load all node config scenario files in deterministic order.""" configs: list[NodeConfig] = [] @@ -877,6 +891,7 @@ def _launch_vm(ssh_pub_key: str) -> None: # Launch QEMU VM pid_file = VM_DIR / f"{VM_NAME}.pid" qemu_log = VM_DIR / f"{VM_NAME}.log" + mac_address = qemu_mac_address() log("============================================") log(f" Launching VM: {VM_NAME}") @@ -884,6 +899,7 @@ def _launch_vm(ssh_pub_key: str) -> None: log(f" CPUs: {VM_CPUS}") log(f" Disk: {vm_disk}") log(f" IP: {VM_IP}") + log(f" MAC: {mac_address}") log(f" Bridge: {BRIDGE_NAME}") log(f" Log: {qemu_log}") log("============================================") @@ -895,7 +911,7 @@ def _launch_vm(ssh_pub_key: str) -> None: "-drive", f"file={vm_disk},format=qcow2,if=virtio", "-drive", f"file={seed_iso},format=raw,if=virtio", "-netdev", f"tap,id=net0,ifname={TAP_NAME},script=no,downscript=no", - "-device", "virtio-net-pci,netdev=net0", + "-device", f"virtio-net-pci,netdev=net0,mac={mac_address}", "-daemonize", "-pidfile", str(pid_file), "-serial", f"file:{qemu_log}", "-display", "none", From d4845d8527208b987b1100ea97b0bf46ca46d91b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 21 May 2026 07:21:09 +0000 Subject: [PATCH 10/14] Fix parallel agent config e2e isolation Agent-Logs-Url: https://github.com/Azure/unbounded/sessions/3ec2c073-d4c3-4d5b-994c-2390bb69403c Co-authored-by: bcho <1975118+bcho@users.noreply.github.com> --- hack/agent/e2e-kind/e2e.py | 49 +++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/hack/agent/e2e-kind/e2e.py b/hack/agent/e2e-kind/e2e.py index bd37ead8..9a40b7ec 100755 --- a/hack/agent/e2e-kind/e2e.py +++ b/hack/agent/e2e-kind/e2e.py @@ -86,7 +86,7 @@ AGENT_DEBUG = os.environ.get("AGENT_DEBUG", "") # Site name used when generating the bootstrap script via kubectl-unbounded. -E2E_SITE_NAME = "e2e" +E2E_SITE_NAME = os.environ.get("E2E_SITE_NAME", "e2e") # Fixed nspawn machine names used by unbounded-agent (decoupled from the kube node name). NSPAWN_MACHINE_NAMES = ["kube1", "kube2"] @@ -352,6 +352,7 @@ def scenario_env(node_config: NodeConfig, index: int) -> dict[str, str]: return { "VM_NAME": vm_name, "AGENT_MACHINE_NAME": vm_name, + "E2E_SITE_NAME": f"{E2E_SITE_NAME}-{name}", "VM_IP": f"{VM_SUBNET}.{10 + index}", "VM_DIR": str(VM_DIR / name), "TAP_NAME": f"tap-e2e-{index}", @@ -2314,6 +2315,51 @@ def _next_patch_version(version: str) -> str: return "v" + ".".join(parts) +def ensure_machine_configuration_for_repave( + config_name: str, + kubernetes_version: str, + node_config: NodeConfig, +) -> None: + """Create the per-machine MachineConfiguration if setup did not pre-create it.""" + + result = subprocess.run( + [KUBECTL, "get", "machineconfiguration", config_name], + capture_output=True, + text=True, + ) + if result.returncode == 0: + return + + output = result.stdout + result.stderr + if "NotFound" not in output and "not found" not in output: + die(f"failed to get MachineConfiguration '{config_name}': {output.strip()}") + + log(f"Creating MachineConfiguration '{config_name}' for repave validation...") + kubernetes_template: dict[str, Any] = {"version": kubernetes_version} + labels = expected_node_labels(node_config) + taints = expected_node_taints(node_config) + if labels: + kubernetes_template["nodeLabels"] = labels + if taints: + kubernetes_template["registerWithTaints"] = taints + + manifest = { + "apiVersion": "unbounded-cloud.io/v1alpha3", + "kind": "MachineConfiguration", + "metadata": { + "name": config_name, + "labels": {"e2e.unbounded-cloud.io/test": "agent-kind"}, + }, + "spec": { + "updateStrategy": {"type": "OnDelete"}, + "template": { + "kubernetes": kubernetes_template, + }, + }, + } + kubectl(["apply", "-f", "-"], input=json.dumps(manifest).encode()) + + def validate_node_repave_upgrade(node_config: NodeConfig) -> None: """Validate OnDelete repave applies a new MCV Kubernetes version.""" @@ -2328,6 +2374,7 @@ def validate_node_repave_upgrade(node_config: NodeConfig) -> None: log(f"Current kubelet version: {current_kubelet_version}") log(f"Target kubelet version: {target_kubelet_version}") + ensure_machine_configuration_for_repave(config_name, current_kubelet_version, node_config) manifest = json.loads(kubectl_capture(["get", "machineconfiguration", config_name, "-o", "json"])) metadata = manifest.setdefault("metadata", {}) for key in ["creationTimestamp", "generation", "resourceVersion", "uid", "managedFields"]: From 44161ccb214aa5edc9df32b90e1fd3240927db30 Mon Sep 17 00:00:00 2001 From: hbc Date: Thu, 21 May 2026 13:29:32 -0700 Subject: [PATCH 11/14] Stabilize agent config e2e node readiness --- hack/agent/e2e-kind/e2e.py | 42 +++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/hack/agent/e2e-kind/e2e.py b/hack/agent/e2e-kind/e2e.py index 9a40b7ec..0ade5923 100755 --- a/hack/agent/e2e-kind/e2e.py +++ b/hack/agent/e2e-kind/e2e.py @@ -491,11 +491,46 @@ def node_kubelet_version(node_name: str) -> str: ]).strip() +def restart_crashing_daemonset_pods(node_name: str, namespace: str, label: str) -> None: + """Delete matching DaemonSet pods stuck in restart backoff on *node_name*.""" + + result = subprocess.run( + [KUBECTL, "get", "pods", "-n", namespace, + "-l", label, "--field-selector", f"spec.nodeName={node_name}", + "-o", "json"], + capture_output=True, text=True, + ) + if result.returncode != 0: + return + + pods = json.loads(result.stdout).get("items", []) + for pod in pods: + pod_name = pod["metadata"]["name"] + for container_status in pod.get("status", {}).get("containerStatuses", []): + if container_status.get("ready"): + continue + waiting = container_status.get("state", {}).get("waiting", {}) + terminated = container_status.get("state", {}).get("terminated", {}) + restart_count = container_status.get("restartCount", 0) + waiting_reason = waiting.get("reason") + terminated_reason = terminated.get("reason") + if restart_count >= 2 or waiting_reason == "CrashLoopBackOff": + log(f" Deleting crashing pod {pod_name} " + f"(restarts={restart_count}, waiting={waiting_reason or 'none'}, " + f"terminated={terminated_reason or 'none'}) to reset backoff") + subprocess.run( + [KUBECTL, "delete", "pod", "-n", namespace, pod_name, + "--grace-period=0", "--force"], + capture_output=True, text=True, + ) + + def wait_for_node_ready(node_name: str, timeout_secs: int = 120) -> None: """Wait until *node_name* reports Ready=True.""" log(f"Waiting for node '{node_name}' to be Ready (timeout: {timeout_secs}s)...") elapsed = 0 + last_restart_attempt = 0 while elapsed < timeout_secs: result = subprocess.run( [KUBECTL, "get", "node", node_name, @@ -508,6 +543,9 @@ def wait_for_node_ready(node_name: str, timeout_secs: int = 120) -> None: return if elapsed > 0 and elapsed % 30 == 0: log(f" ({elapsed}s) Node not yet Ready (status: {status})") + if elapsed >= 30 and elapsed - last_restart_attempt >= 30: + restart_crashing_daemonset_pods(node_name, "kube-system", "app=kindnet") + last_restart_attempt = elapsed time.sleep(5) elapsed += 5 @@ -1314,7 +1352,7 @@ def wait_for_node() -> None: """Wait for the agent node to appear and become Ready.""" node_timeout = int(os.environ.get("NODE_TIMEOUT", "180")) - ready_timeout = int(os.environ.get("READY_TIMEOUT", "120")) + ready_timeout = int(os.environ.get("READY_TIMEOUT", "720")) # Wait for node to appear log(f"Waiting for node '{AGENT_MACHINE_NAME}' to appear (timeout: {node_timeout}s)...") @@ -2523,6 +2561,8 @@ def collect_logs() -> None: _write_command_log(logs_dir / "nodes-describe.txt", [KUBECTL, "describe", "nodes"]) _write_command_log(logs_dir / "pods.txt", [KUBECTL, "get", "pods", "-A", "-o", "wide"]) _write_command_log(logs_dir / "events.txt", [KUBECTL, "get", "events", "-A", "--sort-by=.lastTimestamp"]) + _write_command_log(logs_dir / "kindnet.log", [KUBECTL, "logs", "-n", "kube-system", "--all-containers", "--prefix", "-l", "app=kindnet"]) + _write_command_log(logs_dir / "kindnet-previous.log", [KUBECTL, "logs", "-n", "kube-system", "--all-containers", "--prefix", "--previous", "-l", "app=kindnet"]) _write_command_log(logs_dir / "machines.txt", [KUBECTL, "get", "machines", "-o", "wide"]) _write_command_log(logs_dir / "machines-full.yaml", [KUBECTL, "get", "machines", "-o", "yaml"]) _write_command_log(logs_dir / "machineconfigurations.txt", [KUBECTL, "get", "machineconfigurations", "-o", "wide"]) From 3e3955c3b879eb1f894a7ea6127a6ea192f559e3 Mon Sep 17 00:00:00 2001 From: hbc Date: Thu, 21 May 2026 13:51:00 -0700 Subject: [PATCH 12/14] Avoid logging agent bootstrap script --- hack/agent/e2e-kind/e2e.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/hack/agent/e2e-kind/e2e.py b/hack/agent/e2e-kind/e2e.py index 0ade5923..8f3c478e 100755 --- a/hack/agent/e2e-kind/e2e.py +++ b/hack/agent/e2e-kind/e2e.py @@ -1316,8 +1316,6 @@ def _run_agent_inner(agent_url: str, node_config: NodeConfig) -> None: bootstrap_script_path.write_text(bootstrap_script) bootstrap_script_path.chmod(0o600) log(f"Bootstrap script written to {bootstrap_script_path}") - log("Bootstrap script contents:") - print(bootstrap_script, flush=True) # Wait for cloud-init and verify connectivity log("Waiting for cloud-init to complete on VM...") From 33121d6bde58c4e9deac7e69985d6deb3bac505a Mon Sep 17 00:00:00 2001 From: hbc Date: Thu, 21 May 2026 14:04:55 -0700 Subject: [PATCH 13/14] Remove sensitive e2e log values --- hack/agent/e2e-kind/e2e.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hack/agent/e2e-kind/e2e.py b/hack/agent/e2e-kind/e2e.py index 8f3c478e..6652a5dc 100755 --- a/hack/agent/e2e-kind/e2e.py +++ b/hack/agent/e2e-kind/e2e.py @@ -983,7 +983,7 @@ def _launch_vm(ssh_pub_key: str) -> None: die(f"SSH did not become available after {max_attempts} attempts. Check log: {qemu_log}") print(flush=True) - log(f"VM is ready! SSH: ssh -i {SSH_KEY} ubuntu@{VM_IP}") + log(f"VM is ready at {VM_IP}") # --------------------------------------------------------------------------- @@ -1274,7 +1274,7 @@ def _run_agent_inner(agent_url: str, node_config: NodeConfig) -> None: }, }) kubectl(["apply", "-f", "-"], input=token_manifest.encode()) - log(f"Bootstrap token created: {token_id}.xxxxxxxxxxxxxxxx") + log("Bootstrap token created") # Generate bootstrap script via kubectl-unbounded. # manual-bootstrap auto-detects the API server, CA cert, Kubernetes @@ -2196,7 +2196,7 @@ def validate_machine_cr_created(node_config: NodeConfig) -> None: if token_ref != expected_ref: die(f"bootstrapTokenRef mismatch: got '{token_ref}', expected '{expected_ref}'") - log(f"bootstrapTokenRef is correct: {token_ref}") + log("bootstrapTokenRef is correct") expected_labels = expected_node_labels(node_config) actual_labels = k8s_spec.get("nodeLabels") or {} From dc184373aee36f366bb5d50b2166966b39795b02 Mon Sep 17 00:00:00 2001 From: hbc Date: Thu, 21 May 2026 14:16:38 -0700 Subject: [PATCH 14/14] Avoid logging e2e status message bodies --- hack/agent/e2e-kind/e2e.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hack/agent/e2e-kind/e2e.py b/hack/agent/e2e-kind/e2e.py index 6652a5dc..3ce98336 100755 --- a/hack/agent/e2e-kind/e2e.py +++ b/hack/agent/e2e-kind/e2e.py @@ -413,7 +413,7 @@ def wait_for_machine_operation_complete(name: str, timeout_secs: int = 180) -> d phase = status.get("phase", "") message = status.get("message", "") if phase != last_phase or message != last_message: - log(f" MachineOperation phase={phase or ''} message={message or ''}") + log(f" MachineOperation phase={phase or ''}") last_phase = phase last_message = message if phase == "Complete": @@ -455,7 +455,7 @@ def wait_for_machine_operation_failed( phase = status.get("phase", "") message = status.get("message", "") if phase != last_phase or message != last_message: - log(f" MachineOperation phase={phase or ''} message={message or ''}") + log(f" MachineOperation phase={phase or ''}") last_phase = phase last_message = message if phase == "Failed": @@ -643,7 +643,7 @@ def wait_for_node_reboot_event(node_name: str, boot_id: str, timeout_secs: int = for event in events: message = event.get("message", "") if boot_id in message: - log(f"Observed Node Rebooted event: {message}") + log("Observed Node Rebooted event") return if elapsed > 0 and elapsed % 30 == 0: