Skip to content

Commit fb7e507

Browse files
jmcarp (Josh Carp) authored
misc: run acceptance tests from github actions. (#497)
Builds on #494. --------- Co-authored-by: Josh Carp <[email protected]>
1 parent 5f37d5e commit fb7e507

File tree

6 files changed

+308
-1
lines changed

6 files changed

+308
-1
lines changed

.github/workflows/build-test.yml

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,70 @@ jobs:
2929
run: make test
3030
- name: lint
3131
run: make lint
32+
acceptance:
33+
runs-on: ubuntu-latest
34+
steps:
35+
# Simulated omicron takes up a meaningful amount of disk space, and the
36+
# hosted Github Actions runners don't offer much space. Clean up unused
37+
# dependencies so that we don't run out of disk. Borrowed from
38+
# https://carlosbecker.com/posts/github-actions-disk-space.
39+
- name: "cleanup"
40+
run: |
41+
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
42+
sudo docker image prune --all --force
43+
sudo docker builder prune -a
44+
- uses: actions/checkout@v5
45+
- uses: hashicorp/setup-terraform@v3
46+
- uses: actions/setup-go@v5
47+
with:
48+
go-version-file: 'go.mod'
49+
- uses: docker/setup-compose-action@v1
50+
- uses: astral-sh/setup-uv@v6
51+
- name: install oxide cli
52+
run: |
53+
mkdir -p bin
54+
wget https://github.com/oxidecomputer/oxide.rs/releases/download/v0.13.0+20250730.0.0/oxide-cli-x86_64-unknown-linux-gnu.tar.xz
55+
tar xvf oxide-cli-x86_64-unknown-linux-gnu.tar.xz
56+
mv oxide-cli-x86_64-unknown-linux-gnu/oxide bin
57+
echo "$(pwd)/bin" >> $GITHUB_PATH
58+
# Run simulated omicron in the background with docker compose.
59+
# TODO(jmcarp): support tests against multiple omicron versions.
60+
# TODO(jmcarp): publish this image for faster builds.
61+
- name: omicron-dev
62+
working-directory: acctest
63+
run: |
64+
docker compose build
65+
if ! docker compose up --wait --wait-timeout 1500; then
66+
docker compose logs
67+
exit 1
68+
fi
69+
# We can't use `oxide auth login` here, since it requires a browser to
70+
# complete the oauth device flow. Instead, fetch an auth token using a
71+
# script that simulates the browser flow.
72+
- id: auth-token
73+
working-directory: acctest
74+
run: |
75+
echo "OXIDE_TOKEN=$(uv run auth.py)" >> $GITHUB_OUTPUT
76+
# Create oxide resources necessary for acceptance tests, including an
77+
# arbitrary small image.
78+
- name: oxide-dependencies
79+
run: |
80+
# Install qemu, which we'll use to build a sample image.
81+
sudo apt-get update && sudo apt-get install -y qemu-utils
82+
83+
if ! ./scripts/acc-test-setup.sh; then
84+
docker compose logs
85+
exit 1
86+
fi
87+
env:
88+
OXIDE_HOST: http://localhost:12220
89+
OXIDE_TOKEN: ${{ steps.auth-token.outputs.OXIDE_TOKEN }}
90+
- name: test
91+
shell: bash
92+
run: |
93+
make testacc
94+
env:
95+
OXIDE_HOST: http://localhost:12220
96+
OXIDE_TOKEN: ${{ steps.auth-token.outputs.OXIDE_TOKEN }}
97+
OXIDE_TEST_IP_POOL_NAME: default
98+
OXIDE_SILO_DNS_NAME: "*.sys.oxide-dev.test"

acctest/Dockerfile

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
FROM rust
2+
3+
SHELL ["/bin/bash", "-c"]
4+
5+
RUN \
6+
git clone https://github.com/oxidecomputer/omicron.git --branch main --depth 1 && \
7+
cd omicron && \
8+
source env.sh && \
9+
./tools/install_builder_prerequisites.sh -y -s
10+
11+
COPY nexus-config.toml omicron
12+
13+
WORKDIR omicron

acctest/auth.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# /// script
2+
# dependencies = [
3+
# "httpx",
4+
# ]
5+
# ///
6+
7+
import argparse
8+
import uuid
9+
10+
import httpx
11+
12+
13+
def fetch_token(args):
    """Fetch an API access token by driving the OAuth device flow directly.

    ``oxide auth login`` needs a browser to complete the device authorization
    flow, so this simulates it: log in with a password to obtain a session
    cookie, start the device flow, confirm the user code via the authenticated
    session, then exchange the device code for an access token.

    Args:
        args: Parsed CLI arguments with ``host``, ``silo``, ``username``,
            and ``password`` attributes.

    Returns:
        The access token string.

    Raises:
        httpx.HTTPStatusError: If any request in the flow returns an error
            status.
    """
    # Close the client (and its connection pool) even if a request raises;
    # the original leaked the client by never calling close().
    with httpx.Client() as client:
        # Get auth cookie via password login.
        client.post(
            f"{args.host}/v1/login/{args.silo}/local",
            json={"username": args.username, "password": args.password},
        ).raise_for_status()

        # Start the device auth flow with a throwaway client id. These
        # requests are deliberately unauthenticated (module-level httpx.post),
        # matching the device-grant protocol.
        client_id = str(uuid.uuid4())
        device_resp = httpx.post(
            f"{args.host}/device/auth", data={"client_id": client_id}
        )
        device_resp.raise_for_status()
        device_details = device_resp.json()

        # Confirm the device via the authenticated session.
        client.post(
            f"{args.host}/device/confirm",
            json={"user_code": device_details["user_code"]},
        ).raise_for_status()

        # Exchange the device code for the token.
        token_resp = httpx.post(
            f"{args.host}/device/token",
            data={
                "grant_type": "urn:ietf:params:oauth:grant-type:device_code",
                "device_code": device_details["device_code"],
                "client_id": client_id,
            },
        )
        token_resp.raise_for_status()
        return token_resp.json()["access_token"]
43+
44+
45+
if __name__ == "__main__":
    # CLI defaults match the credentials provisioned by simulated omicron
    # (omicron-dev) — presumably its built-in test-suite silo; verify against
    # the omicron-dev setup if they change.
    arg_parser = argparse.ArgumentParser()
    for flag, default in (
        ("--host", "http://localhost:12220"),
        ("--silo", "test-suite-silo"),
        ("--username", "test-privileged"),
        ("--password", "oxide"),
    ):
        arg_parser.add_argument(flag, default=default)

    # Print the token so callers (e.g. CI) can capture it from stdout.
    print(fetch_token(arg_parser.parse_args()))

acctest/docker-compose.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
services:
2+
omicron-dev:
3+
build: .
4+
command: /bin/bash -c 'source env.sh && cargo xtask omicron-dev run-all --nexus-config ./nexus-config.toml'
5+
healthcheck:
6+
test: ["CMD", "curl", "-f", "http://localhost:12220"]
7+
interval: 5s
8+
retries: 300
9+
ports:
10+
- "12220:12220"

acctest/nexus-config.toml

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
#
2+
# Oxide API: example configuration file
3+
#
4+
5+
[console]
6+
# Directory for static assets. Absolute path or relative to CWD.
7+
static_dir = "out/console-assets"
8+
session_idle_timeout_minutes = 480 # 8 hours
9+
session_absolute_timeout_minutes = 1440 # 24 hours
10+
11+
# List of authentication schemes to support.
12+
[authn]
13+
schemes_external = ["session_cookie", "access_token"]
14+
15+
[log]
16+
# Show log messages of this level and more severe
17+
level = "info"
18+
19+
# Example output to a terminal (with colors)
20+
mode = "stderr-terminal"
21+
22+
# Example output to a file, appending if it already exists.
23+
#mode = "file"
24+
#path = "logs/server.log"
25+
#if_exists = "append"
26+
27+
# Configuration for interacting with the timeseries database
28+
[timeseries_db]
29+
address = "[::1]:8123"
30+
31+
[deployment]
32+
# Identifier for this instance of Nexus
33+
id = "e6bff1ff-24fb-49dc-a54e-c6a350cd4d6c"
34+
rack_id = "c19a698f-c6f9-4a17-ae30-20d711b8f7dc"
35+
36+
# Nexus may need to resolve external hosts (e.g. to grab IdP metadata).
37+
# These are the DNS servers it should use.
38+
external_dns_servers = ["1.1.1.1", "9.9.9.9"]
39+
40+
[deployment.dropshot_external]
41+
# IP Address and TCP port on which to listen for the external API
42+
bind_address = "0.0.0.0:12220"
43+
default_request_body_max_bytes = 1048576
44+
# To have Nexus's external HTTP endpoint use TLS, uncomment the line below. You
45+
# will also need to provide an initial TLS certificate during rack
46+
# initialization. If you're using this config file, you're probably running a
47+
# simulated system. In that case, the initial certificate is provided to the
48+
# simulated sled agent (acting as RSS) via command-line arguments.
49+
#tls = true
50+
51+
[deployment.dropshot_internal]
52+
# IP Address and TCP port on which to listen for the internal API
53+
bind_address = "[::1]:12221"
54+
default_request_body_max_bytes = 1048576
55+
56+
[deployment.dropshot_lockstep]
57+
# IP Address and TCP port on which to listen for the lockstep API
58+
bind_address = "[::1]:12232"
59+
default_request_body_max_bytes = 1048576
60+
61+
[deployment.internal_dns]
62+
# Example address.
63+
# If you're using `omicron-dev run-all`, this value is overwritten
64+
# by the address / port created after starting the Internal DNS server.
65+
type = "from_address"
66+
address = "[::1]:3535"
67+
68+
[deployment.database]
69+
# URL for connecting to the database
70+
type = "from_url"
71+
url = "postgresql://root@[::1]:32221/omicron?sslmode=disable"
72+
73+
# Tunable configuration parameters, for testing or experimentation
74+
[tunables]
75+
76+
# The maximum allowed prefix (thus smallest size) for a VPC Subnet's
77+
# IPv4 subnetwork. This size allows for ~60 hosts.
78+
max_vpc_ipv4_subnet_prefix = 26
79+
80+
# Configuration for interacting with the dataplane daemon
81+
[dendrite.switch0]
82+
address = "[::1]:12224"
83+
84+
[background_tasks]
85+
dns_internal.period_secs_config = 60
86+
dns_internal.period_secs_servers = 60
87+
dns_internal.period_secs_propagation = 60
88+
dns_internal.max_concurrent_server_updates = 5
89+
dns_external.period_secs_config = 60
90+
dns_external.period_secs_servers = 60
91+
dns_external.period_secs_propagation = 60
92+
dns_external.max_concurrent_server_updates = 5
93+
metrics_producer_gc.period_secs = 60
94+
# How frequently we check the list of stored TLS certificates. This is
95+
# approximately an upper bound on how soon after updating the list of
96+
# certificates it will take _other_ Nexus instances to notice and stop serving
97+
# them (on a sunny day).
98+
external_endpoints.period_secs = 60
99+
nat_cleanup.period_secs = 30
100+
bfd_manager.period_secs = 30
101+
# How frequently to collect hardware/software inventory from the whole system
102+
# (even if we don't have reason to believe anything has changed).
103+
inventory.period_secs = 600
104+
# Maximum number of past collections to keep in the database
105+
inventory.nkeep = 5
106+
# Disable inventory collection altogether (for emergencies)
107+
inventory.disable = false
108+
phantom_disks.period_secs = 30
109+
physical_disk_adoption.period_secs = 30
110+
support_bundle_collector.period_secs = 30
111+
decommissioned_disk_cleaner.period_secs = 60
112+
blueprints.period_secs_load = 10
113+
blueprints.period_secs_plan = 60
114+
blueprints.period_secs_execute = 60
115+
blueprints.period_secs_rendezvous = 300
116+
blueprints.period_secs_collect_crdb_node_ids = 180
117+
blueprints.period_secs_load_reconfigurator_config = 5
118+
sync_service_zone_nat.period_secs = 30
119+
switch_port_settings_manager.period_secs = 30
120+
region_replacement.period_secs = 30
121+
region_replacement_driver.period_secs = 30
122+
# How frequently to query the status of active instances.
123+
instance_watcher.period_secs = 30
124+
# How frequently to schedule new instance update sagas.
125+
instance_updater.period_secs = 30
126+
# How frequently to attempt to restart Failed instances?
127+
instance_reincarnation.period_secs = 60
128+
service_firewall_propagation.period_secs = 300
129+
v2p_mapping_propagation.period_secs = 30
130+
abandoned_vmm_reaper.period_secs = 60
131+
saga_recovery.period_secs = 600
132+
lookup_region_port.period_secs = 60
133+
region_snapshot_replacement_start.period_secs = 30
134+
region_snapshot_replacement_garbage_collection.period_secs = 30
135+
region_snapshot_replacement_step.period_secs = 30
136+
region_snapshot_replacement_finish.period_secs = 30
137+
tuf_artifact_replication.period_secs = 300
138+
tuf_artifact_replication.min_sled_replication = 1
139+
tuf_repo_pruner.period_secs = 300
140+
# How many extra recent target releases to keep
141+
# The system always keeps two: the current release and the previous one.
142+
# This number is in addition to that.
143+
tuf_repo_pruner.nkeep_extra_target_releases = 1
144+
# How many extra recently uploaded repos to keep
145+
# The system always keeps one, assuming that the operator may be about to
146+
# update to it. This number is in addition to that.
147+
tuf_repo_pruner.nkeep_extra_newly_uploaded = 1
148+
# In general, the webhook dispatcher will be activated when events are queued,
149+
# so we don't need to periodically activate it *that* frequently.
150+
alert_dispatcher.period_secs = 60
151+
webhook_deliverator.period_secs = 60
152+
read_only_region_replacement_start.period_secs = 30
153+
sp_ereport_ingester.period_secs = 30
154+
155+
[default_region_allocation_strategy]
156+
# allocate region on 3 random distinct zpools, on 3 random distinct sleds.
157+
# type = "random_with_distinct_sleds"
158+
159+
# the same as random_with_distinct_sleds, but without requiring distinct sleds
160+
type = "random"
161+
162+
# setting `seed` to a fixed value will make dataset selection ordering use the
163+
# same shuffling order for every region allocation.
164+
# seed = 0

scripts/acc-test-setup.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ PROJECT_NAME=${OXIDE_PROJECT:-tf-acc-test}
1010
# Default to test-suite-silo, the silo used by omicron-dev.
1111
SILO_NAME=${OXIDE_SILO:-test-suite-silo}
1212

13+
# Build a sample image, if not specified by caller.
1314
IMAGE_PATH=${OXIDE_IMAGE_PATH:-alpine.raw}
14-
1515
if [ ! -e "$IMAGE_PATH" ]; then
1616
curl -L -o alpine.qcow2 https://dl-cdn.alpinelinux.org/alpine/v3.22/releases/cloud/generic_alpine-3.22.1-x86_64-bios-tiny-r0.qcow2
1717
qemu-img convert -f qcow2 -O raw alpine.qcow2 "$IMAGE_PATH"

0 commit comments

Comments
 (0)