diff --git a/.github/workflows/self-test-reference.yml b/.github/workflows/self-test-reference.yml new file mode 100644 index 0000000..7509478 --- /dev/null +++ b/.github/workflows/self-test-reference.yml @@ -0,0 +1,105 @@ +name: Self-test reference check + +on: + repository_dispatch: + types: + - self-test-reference-source-updated + pull_request: + paths: + - 'host/self-test-reference.mdx' + - 'host/how-to-self-test.mdx' + - 'host/verification-stages.mdx' + - 'scripts/generate_self_test_reference.py' + - 'docs.json' + - 'package.json' + - '.github/workflows/self-test-reference.yml' + workflow_dispatch: + inputs: + vast_cli_ref: + description: 'vast-ai/vast-cli ref to generate from' + required: false + default: 'master' + self_test_ref: + description: 'vast-ai/self-test ref to generate from' + required: false + default: 'main' + +jobs: + verify-self-test-reference: + runs-on: ubuntu-latest + steps: + - name: Checkout docs + uses: actions/checkout@v4 + with: + path: docs + + - name: Decide whether private source repos can be checked out + id: source-access + env: + SOURCE_TOKEN: ${{ secrets.VAST_DOCS_SOURCE_TOKEN }} + HEAD_REPO: ${{ github.event.pull_request.head.repo.full_name || github.repository }} + run: | + if [ "${{ github.event_name }}" = "pull_request" ] && [ "$HEAD_REPO" != "${{ github.repository }}" ] && [ -z "$SOURCE_TOKEN" ]; then + echo "can_verify=false" >> "$GITHUB_OUTPUT" + echo "::notice::Skipping self-test reference sync check for fork PR because VAST_DOCS_SOURCE_TOKEN is unavailable. Run workflow_dispatch or use an upstream branch to verify private source repos." 
+ else + echo "can_verify=true" >> "$GITHUB_OUTPUT" + fi + + - name: Checkout vast-cli source + if: steps.source-access.outputs.can_verify == 'true' + uses: actions/checkout@v4 + with: + repository: vast-ai/vast-cli + ref: ${{ github.event.client_payload.vast_cli_ref || github.event.inputs.vast_cli_ref || 'master' }} + path: vast-cli-source + token: ${{ secrets.VAST_DOCS_SOURCE_TOKEN || github.token }} + + - name: Checkout self-test source + if: steps.source-access.outputs.can_verify == 'true' + uses: actions/checkout@v4 + with: + repository: vast-ai/self-test + ref: ${{ github.event.client_payload.self_test_ref || github.event.inputs.self_test_ref || 'main' }} + path: self-test-source + token: ${{ secrets.VAST_DOCS_SOURCE_TOKEN || github.token }} + + - uses: actions/setup-python@v5 + if: steps.source-access.outputs.can_verify == 'true' + with: + python-version: '3.11' + + - name: Regenerate self-test reference + if: steps.source-access.outputs.can_verify == 'true' + working-directory: docs + run: | + PYTHONDONTWRITEBYTECODE=1 npm run generate-self-test-reference -- \ + --vast-cli ../vast-cli-source \ + --self-test ../self-test-source + + - name: Fail if generated page is out of sync + if: steps.source-access.outputs.can_verify == 'true' + working-directory: docs + run: | + if ! git diff --quiet host/self-test-reference.mdx; then + echo "::error::host/self-test-reference.mdx is out of sync with vast-ai/vast-cli or vast-ai/self-test." + echo "::error::Run 'npm run generate-self-test-reference -- --vast-cli PATH_TO_VAST_CLI --self-test PATH_TO_SELF_TEST' and commit the result." 
+ git diff --stat host/self-test-reference.mdx + git diff -- host/self-test-reference.mdx + exit 1 + fi + + - name: Fail if Python bytecode was generated + if: steps.source-access.outputs.can_verify == 'true' + working-directory: docs + run: | + if find scripts -name '__pycache__' -o -name '*.pyc' | grep -q .; then + echo "::error::Generated Python bytecode should not be committed or produced by the docs generation check." + find scripts -name '__pycache__' -o -name '*.pyc' + exit 1 + fi + + - name: Report skipped private-source validation + if: steps.source-access.outputs.can_verify != 'true' + run: | + echo "Self-test reference sync check skipped because this fork PR cannot access private source repositories." diff --git a/README.md b/README.md index c5dc285..b53adb4 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,36 @@ API endpoint specs live in `api-reference/openapi/yaml/` (one file per endpoint) 4. Preview locally: `mint dev`. 5. Commit both your YAML edit AND the regenerated `api-reference/openapi.yaml`, then open a PR. CI re-runs the build and fails if `openapi.yaml` is out of sync with the sources. +## Updating the self-test reference + +The host self-test reference page is generated from the Vast CLI diagnostics and the self-test image metadata. + +1. Update the relevant self-test source in `vast-ai/vast-cli` or `vast-ai/self-test`. +2. Regenerate the docs page: + +```bash +npm run generate-self-test-reference -- --vast-cli ../vast-cli --self-test ../self-test +``` + +3. Review and commit `host/self-test-reference.mdx` with the source change or in the matching docs PR. +4. CI re-runs the generator against `vast-ai/vast-cli` and `vast-ai/self-test` and fails if the committed page is out of sync. + +For cross-repo private checkouts, configure the docs repository secret `VAST_DOCS_SOURCE_TOKEN` with read access to `vast-ai/vast-cli` and `vast-ai/self-test`. 
The workflow can also be run manually with custom `vast_cli_ref` and `self_test_ref` inputs while a source PR is under review. + +Source repositories can trigger the docs check after self-test metadata changes by sending a repository dispatch event to `vast-ai/docs`: + +```json +{ + "event_type": "self-test-reference-source-updated", + "client_payload": { + "vast_cli_ref": "master", + "self_test_ref": "main" + } +} +``` + +Use the exact branch or SHA being validated when triggering this from a source PR workflow. + ## Mintlify Information **[Follow the full quickstart guide](https://starter.mintlify.com/quickstart)** diff --git a/docs.json b/docs.json index 1956f75..095f432 100644 --- a/docs.json +++ b/docs.json @@ -601,6 +601,7 @@ "pages": [ "host/verification-stages", "host/how-to-self-test", + "host/self-test-reference", "host/market-metrics", "host/datacenter-status", "host/payment", diff --git a/host/how-to-self-test.mdx b/host/how-to-self-test.mdx index 8eed828..9e3e500 100644 --- a/host/how-to-self-test.mdx +++ b/host/how-to-self-test.mdx @@ -34,6 +34,8 @@ During the self-test, the following components and conditions are verified: A short test workload will be executed to assess actual runtime performance. +For the exact thresholds, runtime stages, failure codes, and diagnostic bundle behavior, see the [Self-test Reference](/host/self-test-reference). + **Tip:** Ensure no other jobs or instances are running during the self-test for the most accurate results. @@ -90,11 +92,25 @@ If the test fails: * The CLI will display detailed reasons for failure. * Apply the suggested fixes and rerun the test. +* Review the [Self-test Reference](/host/self-test-reference) for the failed check or failure code. If the test says the machine is "not found or not rentable": -* Try un-listing your machine, then listing it again. -* Ensure your machine has no missing data in your machines page, such as upload and download speed, RAM, or ports. 
+* Check whether the machine is currently rented, offline, unlisted, below the reliability threshold, missing active offers, or hidden from the API key you are using. +* Inspect all offers for the machine: + +```bash +vastai search offers 'machine_id=MACHINE_ID rentable=any rented=any' +``` + +* Review the [Self-test Reference](/host/self-test-reference#not-found-or-not-rentable) for the specific root states the CLI can report. + +If the test reports no response or a progress timeout: + +* Check the external IP and port printed by the CLI, if present. +* Confirm your router/firewall forwards that external port to the host's LAN IP. +* If you are running the CLI from the same LAN as the host, retest from another network to rule out NAT hairpinning. +* Review the diagnostic bundle path printed by the CLI after failure. ### Optional: Ignore Requirements Mode @@ -118,3 +134,5 @@ Even if the test passes in this mode, your machine does not meet the minimum ver **Important:** Even with `--ignore-requirements`, your machine must have at least three direct open ports, otherwise, the self-test will fail. + +With `--ignore-requirements`, a passing result means the internal runtime workload passed. It does not mean the machine passed the minimum verification requirements. diff --git a/host/self-test-reference.mdx b/host/self-test-reference.mdx new file mode 100644 index 0000000..123e611 --- /dev/null +++ b/host/self-test-reference.mdx @@ -0,0 +1,221 @@ +--- +title: "Verification / Self-test Reference" +sidebarTitle: "Self-test Reference" +description: "Generated reference for host self-test checks, thresholds, failure codes, and guidance." +"canonical": "/host/self-test-reference" +--- + +{/* + This page is generated by scripts/generate_self_test_reference.py. + Do not edit this file by hand; update the Vast CLI/self-test source metadata, then regenerate. + Source: vast-ai/vast-cli upstream/master@e1a8d10 dirty=no. + Source: vast-ai/self-test upstream/main@6f93fc4 dirty=no.
+*/} + +The host self-test is the quickest way to check whether a listed machine can pass Vast.ai's minimum verification gate and run the runtime workload used by the tester. + +When you run `vastai self-test machine MACHINE_ID`, the CLI selects a rentable offer for that machine, checks minimum requirements, rents one temporary diagnostic instance, starts the self-test image, polls the runtime progress endpoint, reports the result, and destroys the temporary instance. + + +Passing this self-test makes a machine eligible for verification, but it does not guarantee that the machine will be verified immediately. Verification also depends on ongoing health, reliability, supply and demand, and platform policy. + + + +`--ignore-requirements` is for dogfooding only. A pass with requirement checks ignored does not qualify the machine for verification. + + +## How To Read The Result + +Self-test output has two distinct parts: preflight qualification checks and the runtime workload. + +| Result | What it means | What to do next | +| --- | --- | --- | +| Normal pass | Minimum requirements passed and the runtime workload passed. The machine is eligible for verification, subject to the normal platform verification process. | Keep the host stable and listed. Verification is still automated and not guaranteed immediately. | +| Normal preflight failure | The CLI found one or more requirement failures before renting a temporary instance. | Fix the measured values shown in the CLI, then rerun without `--ignore-requirements`. | +| Runtime failure | The CLI rented a temporary instance, started the self-test image, and a runtime stage failed or timed out. | Use the failure code, last runtime stage, and diagnostic bundle to identify the failing subsystem. | +| Pass with `--ignore-requirements` | The runtime workload passed, but minimum requirement checks were skipped. This does not qualify the machine for verification. | Treat this as dogfood/runtime validation only.
Rerun without `--ignore-requirements` to see qualification status. | + + +If you use `--ignore-requirements`, still review any requirement diagnostics from a normal run. A runtime pass proves the container workload can run; it does not prove the machine meets the verification gate. + + +## Preflight Checks + +These checks run before the CLI rents the temporary self-test instance. Failed required checks stop the normal flow before billing starts. + +| Check | Gate | Purpose | Host guidance | +| --- | --- | --- | --- | +| `cuda.version`
CUDA version | Required: CUDA version >= 11.8 | The self-test image needs a host driver stack compatible with CUDA 11.8 or newer. | Update the NVIDIA driver/CUDA stack, then confirm with: vastai search offers 'machine_id=MACHINE_ID rentable=any rented=any' | +| `reliability`
Reliability | Required: Reliability > 0.90 | Self-test uses reliability as a guardrail before launching a temporary instance. | Let the host stabilize and resolve recent failures before retrying the self-test. | +| `network.direct_ports`
Direct port count | Required: Direct ports >= 3 * listed GPUs | The tester needs at least 3 directly mapped ports per listed GPU for remote progress, SSH checks, and runtime port allocation. | Open at least 3 direct ports per listed GPU, check firewall/NAT forwarding, and make TCP/UDP forwarding symmetric where required. | +| `pcie.bandwidth`
PCIe bandwidth | Required: PCIe bandwidth > 2.85 GB/s | Low PCIe bandwidth can make GPU stress and transfer checks fail or time out. | Check BIOS PCIe generation/lane settings and confirm GPUs are seated in full-speed slots. | +| `network.download`
Download speed | Required: Download >= min(500, max(100, 500 * total_vram_gib / 192)) Mb/s | The bandwidth floor scales with total VRAM so large GPU hosts can complete data movement tests. | Improve host download bandwidth or reduce contention, then rerun the Vast host verification. | +| `network.upload`
Upload speed | Required: Upload >= min(500, max(100, 500 * total_vram_gib / 192)) Mb/s | The tester needs enough upload bandwidth to report progress and complete network checks. | Improve host upload bandwidth or reduce contention, then rerun the Vast host verification. | +| `gpu.ram`
GPU RAM | Required: Per-GPU VRAM > 7 GiB | The verification workload requires more than 7 GiB of VRAM per GPU. | Use a GPU with more VRAM for this self-test. | +| `system.ram`
System RAM | Required: System RAM >= min(0.95 * total GPU VRAM, 2,000,000 MiB) | System RAM must be close to total VRAM so CPU-side staging does not starve the tests. For very large GPU hosts, this requirement is capped at about 2 TB. | Add system RAM or reduce the listed GPU set so system RAM is at least 95% of total VRAM, up to the 2 TB cap. | +| `cpu.cores`
CPU cores | Required: Physical CPU cores >= listed GPUs | The tester expects at least one physical CPU core per listed GPU. Hyperthreads/logical CPUs do not count as physical cores. | Add physical CPU cores or reduce the listed GPU count for this offer. | +| `network.direct_ports.recommended_max`
Direct port count advisory | Advisory: direct ports <= 64 * listed GPUs | Vast instances can use at most 64 open ports each. Mapping more than 64 ports per listed GPU is usually wasted effort. | This is advisory only, not a self-test gate. Keep enough direct ports for self-test and normal workloads, but avoid mapping very large unused ranges. | + +### Direct Ports And Port Mapping + +The self-test needs direct public connectivity to the temporary instance. The progress service runs inside the self-test container on `5000/tcp`, but the CLI connects to the mapped external public IP and external port reported by the instance. + +- Minimum gate: at least 3 direct ports per listed GPU. +- Useful cap: each instance can use up to 64 ports. Mapping more than 64 ports per listed GPU is usually unnecessary and is not a self-test requirement. +- Port forwarding should target the host's LAN address, not its public address. +- Keep TCP and UDP forwarding symmetric where your network setup requires both protocols. +- If the CLI reports a tested external IP:port, troubleshoot that external mapping first. +- If the host and CLI are on the same LAN, a local failure to reach the public IP can be NAT hairpinning. Retest from an outside network before assuming the port is closed globally. + + +The CLI can report the external progress port it tested when that mapping is available. A full list of exactly which direct ports failed still requires backend or daemon-side exposure. 
+ + +### Bandwidth Formula + +Upload and download thresholds scale with total machine VRAM: + +```text +required_mbps = min(500, max(100, 500 * total_vram_gib / 192)) +``` + +| Total machine VRAM | Required upload and download | +| --- | --- | +| 8 GiB total VRAM | 100 Mb/s | +| 48 GiB total VRAM | 125 Mb/s | +| 80 GiB total VRAM | 208.33 Mb/s | +| 96 GiB total VRAM | 250 Mb/s | +| 160 GiB total VRAM | 416.67 Mb/s | +| 192 GiB total VRAM or more | 500 Mb/s | + +## Self-test Image Selection + +The CLI selects from the self-test image family unless `--test-image` or `VAST_SELF_TEST_IMAGE` overrides the image for dogfood testing. + +| CLI image | Torch | Targets | Platforms | +| --- | --- | --- | --- | +| `vastai/test:self-test-v2-cuda-11.8` | 2.7.1 | Pre-sm_70 (Maxwell, Pascal); older R450+ drivers | linux/amd64 | +| `vastai/test:self-test-v2-cuda-12.8` | 2.10.0 | sm_70+ (Volta and newer); current default | linux/amd64, linux/arm64 | +| `vastai/test:self-test-v2-cuda-13.0` | 2.11.0 | sm_75+ (Turing and newer); cu130 wheels never shipped sm_70 | linux/amd64, linux/arm64 | +| `vastai/test:self-test-v2-cuda-13.3` | 2.12.0 | sm_75+ (Turing and newer); CUDA 13.3 runtime with latest available CUDA-13 PyTorch wheels (`cu132`) | linux/amd64, linux/arm64 | + +Selection rules: + +- Pre-Volta GPUs (`compute_cap < 700`) use the CUDA 11.8 image. +- Volta GPUs (`compute_cap < 750`) are capped at CUDA 12.8 because newer PyTorch CUDA 13 wheels do not include sm_70 support. +- Other hosts use the newest supported self-test image that is less than or equal to `cuda_max_good`. + +## Runtime Stages + +After preflight passes, the CLI starts the self-test image and polls the runtime progress service on container port `5000/tcp`. + +| Stage | Pass condition / threshold | Purpose | Failure guidance | +| --- | --- | --- | --- | +| `image_started`
Self-test image started | The runtime container starts and writes the first progress event. | Confirm the self-test runtime started and can report progress. | If this is the last event, inspect container startup logs and Python import errors. | +| `system_requirements`
System requirements | Each GPU has at least 98% free VRAM; system RAM is at least 95% of total GPU VRAM capped at 2,000,000 MiB; physical CPU cores are at least the visible GPU count. | Verify GPU visibility, available VRAM, host RAM, and CPU core capacity. | Check CUDA visibility, host memory, CPU allocation, and competing GPU workloads. | +| `resnet`
ResNet GPU execution | A CUDA ResNet18 workload completes on the visible GPU set at any tested batch size. | Run a CUDA ResNet18 workload across visible GPUs. | Check PyTorch CUDA compatibility, GPU memory pressure, and driver health. | +| `ecc`
ECC memory allocation | The test allocates 95% of total memory on each visible GPU. | Allocate most GPU memory on each visible GPU to surface memory/ECC failures. | Check GPU ECC/Xid errors, available VRAM, and device health. | +| `nccl`
NCCL distributed communication | At least 1 GPU is visible and all NCCL ranks initialize and synchronize on one machine. | Initialize NCCL workers across all visible GPUs and synchronize them. | Check NCCL support, peer communication, CUDA driver/runtime compatibility, and process limits. | +| `stress_gpu_burn`
CPU stress and GPU burn | stress-ng and gpu-burn run together for 60 seconds and both exit with code 0. | Run stress-ng and gpu-burn together to exercise sustained host and GPU load. | Check thermals, power limits, gpu-burn output, stress-ng output, and system stability. | +| `final_summary`
Final summary | The runtime reports the overall pass/fail result and exit code. | Report the overall self-test result. | Review the failed stage event and raw output tails for the first actionable failure. | + +## Offer Selection And Preflight Issues + +The CLI reports stable preflight failure codes and, when possible, a likely root state for machines that are not currently rentable. + +### Not Found Or Not Rentable + +The old `not found or not rentable` wording hid several different states. The newer CLI tries to disambiguate the state before giving guidance. + +Typical root causes: + +- The machine is currently rented. +- The machine is visible but has zero active on-demand offers. +- The machine is offline, unlisted, or not visible to the API account. +- The machine is deverified, below the reliability threshold, or has offer-side error metadata. +- The API key can authenticate but does not have permission to inspect the required host or offer state. + +Useful inspection command: + +```bash +vastai search offers 'machine_id=MACHINE_ID rentable=any rented=any' +``` + +| Code or root state | Meaning | Guidance | +| --- | --- | --- | +| `no_offer` | No on-demand offer found for the machine. | Confirm the machine ID, host online/listed state, and visible offers. | +| `no_rentable_offer` | Offers were visible, but none were currently rentable. | Wait for rentals/state refreshes or inspect host offer state. | +| `api_permission_failed` | The API key could not inspect the required machine/offer state. | Use an API key/account with host machine and offer visibility. | +| `preflight_requirements_failed` | One or more minimum requirement checks failed before renting. | Resolve the failed checks or rerun with --ignore-requirements for dogfood only. | +| `currently_rented` | Visible offers exist and one or more are already rented. | Inspect all visible offers with `vastai search offers 'machine_id=MACHINE_ID rentable=any rented=any'`. 
| +| `deverified_or_below_threshold` | Visible offers exist but host reliability, verification state, vericode, or error metadata points to a host quality gate. | Inspect all visible offers with `vastai search offers 'machine_id=MACHINE_ID rentable=any rented=any'`. | +| `zero_active_offers` | The machine is visible, but no active on-demand offers are listed for it. | Inspect all visible offers with `vastai search offers 'machine_id=MACHINE_ID rentable=any rented=any'`. | +| `offline_or_not_listed` | The machine is not visible to the account or appears offline/not listed. | Inspect all visible offers with `vastai search offers 'machine_id=MACHINE_ID rentable=any rented=any'`. | +| `unknown_no_rentable_offer` | Visible offers exist, but the payload does not expose a specific non-rentable reason. | Inspect all visible offers with `vastai search offers 'machine_id=MACHINE_ID rentable=any rented=any'`. | + +## Runtime Failure Codes + +Runtime failure codes are stable identifiers intended for CLI output, support workflows, and host-facing guidance. + +### No Response Or Progress Timeout + +A `no response` or progress timeout means the CLI could not get usable progress from the temporary self-test instance after it was created. This is usually a connectivity or startup problem, not a generic verification decision. + +Common causes: + +- Router or firewall forwards the external port to the wrong LAN IP. +- The external TCP port is closed, blocked, or not hairpin-accessible from the CLI's network. +- The self-test container never started, crashed, or did not bind the progress service. +- Docker, NVIDIA runtime, or the host daemon stalled during startup. +- The GPU or system hung under load before progress could be reported. +- Upload/network instability prevented progress responses from reaching the CLI. + +First checks: + +- Look at the failure code and the tested external IP:port in CLI output when present. 
+- Confirm the router/firewall forwards that external port to the host machine. +- Inspect the diagnostic bundle for `instance/container.log`, `instance/daemon.log`, and `instance/show-instance.json` when the instance existed. +- If you ran the CLI from the same LAN as the host, retry from a different network to rule out NAT loopback/hairpinning. + +| Code | Area | Meaning | Remediation | Suggested steps | +| --- | --- | --- | --- | --- | +| `instance_create_failed` | Launch, network, or cleanup | Failed to create the runtime test instance. | Check the offer, docker image, and instance creation response. | Retry with --debugging enabled.
Inspect the create-instance API error. | +| `instance_create_missing_contract` | Launch, network, or cleanup | Instance creation did not return a new contract id. | Treat the create response as malformed or incomplete. | Inspect the raw create-instance response.
Retry after confirming the offer is still rentable. | +| `instance_status_error` | Launch, network, or cleanup | The instance reported an error while starting. | Inspect the instance status message and host/container logs. | Run show instance for the contract.
Check docker logs on the host if available. | +| `instance_status_poll_failed` | Launch, network, or cleanup | Failed to poll instance status. | Confirm API connectivity and retry the status check. | Retry the CLI command.
Check network/API errors from the status request. | +| `instance_start_timeout` | Launch, network, or cleanup | The instance did not reach running state before timeout. | Check host capacity, docker startup, and network configuration. | Inspect instance status_msg.
Try the test again after confirming the host is healthy. | +| `instance_offline_before_test` | Launch, network, or cleanup | The instance went offline before the runtime test could run. | Investigate host availability and instance lifecycle events. | Check machine status.
Review host daemon and container logs. | +| `missing_public_ip` | Launch, network, or cleanup | The running instance did not expose a public IP address. | Confirm the instance network configuration and public IP assignment. | Inspect show instance output.
Retry on a machine with public networking available. | +| `progress_port_not_mapped` | Launch, network, or cleanup | The runtime progress port was not mapped. | Confirm port 5000/tcp is exposed and direct ports are available. | Check the available mapped ports in the diagnostic output.
Verify the machine has enough direct ports. | +| `progress_endpoint_unreachable` | Launch, network, or cleanup | The runtime progress endpoint was never reachable. | Check TCP firewall/NAT forwarding, direct port mapping, container startup, and NAT hairpinning. | Confirm the mapped public TCP port forwards to the host LAN IP.
Inspect docker logs to confirm the progress server bound port 5000/tcp.
If testing from the same LAN as the host, retry from an outside network to rule out NAT loopback/hairpinning. | +| `progress_endpoint_lost` | Launch, network, or cleanup | The runtime progress endpoint became unreachable after connecting. | Look for container crashes, OOM, GPU errors, or host instability. | Inspect docker logs for a crash or missing progress server.
Check dmesg for Xid, GPU reset, OOM, or host stall messages.
Check for network loss between the CLI and host public endpoint. | +| `progress_empty_timeout` | Launch, network, or cleanup | The progress endpoint returned no new output before timeout. | Check whether the runtime script stalled or stopped writing progress. | Inspect runtime logs.
Retry with debugging enabled. | +| `runtime_test_timeout` | Launch, network, or cleanup | The runtime test did not complete before timeout. | Investigate long-running or stalled test stages. | Check the last reported progress stage.
Run individual tests to isolate the stall. | +| `legacy_progress_error` | Runtime test | The legacy runtime progress stream reported an unclassified error. | Use the raw error text and active stage to decide the next action. | Inspect the original ERROR line.
Retry with debugging enabled if the cause is unclear. | +| `docker_pull_failed` | Image or container startup | The test image could not be pulled. | Check image name, tag availability, registry access, and credentials. | Verify the docker image tag exists.
Check for registry unauthorized or denied errors. | +| `daemon_startup_failed` | Image or container startup | The container or daemon failed during startup. | Inspect docker daemon, OCI runtime, and container startup logs. | Check docker logs.
Verify NVIDIA container runtime and host daemon health. | +| `nvml_failed` | Runtime test | NVML or nvidia-smi failed during system checks. | Check NVIDIA driver, NVML, and GPU visibility on the host. | Run nvidia-smi on the host.
Check driver/library mismatch or GPU reset errors. | +| `resnet_failed` | Runtime test | The ResNet runtime test failed. | Check PyTorch/CUDA health, available VRAM, and GPU compute stability. | Look for CUDA OOM, cuDNN, or torch runtime errors.
Run a smaller isolated torch workload. | +| `ecc_failed` | Runtime test | The ECC runtime test failed. | Check GPU ECC counters and hardware health. | Inspect ECC error counters.
Review dmesg and nvidia-smi health output. | +| `nccl_failed` | Runtime test | The NCCL distributed runtime test failed. | Check multi-GPU connectivity, NCCL transport, and network fabric. | Inspect NCCL error output.
Verify peer-to-peer and multi-GPU communication. | +| `stress_gpu_burn_failed` | Runtime test | The stress-ng or gpu-burn runtime test failed. | Check thermals, power stability, GPU Xid errors, and host stress logs. | Inspect dmesg for Xid errors.
Review gpu-burn and stress-ng output. | +| `interrupted` | Launch, network, or cleanup | The runtime test was interrupted. | Ensure cleanup completed or destroy the test instance manually. | Check whether the test instance still exists.
Destroy leaked instances if needed. | +| `cleanup_failed` | Launch, network, or cleanup | Runtime test cleanup failed. | Destroy the temporary test instance manually to avoid continued billing. | Run destroy instance for the temporary contract.
Retry cleanup after checking API connectivity. | + +## Diagnostic Bundles + +When a self-test fails, the CLI builds a redacted diagnostic tarball unless bundle creation is disabled. + +- Default output directory: `/tmp`. +- Disable automatic bundles with `--no-support-bundle` or `VAST_SELF_TEST_SUPPORT_BUNDLE=0`. +- Choose another directory with `--support-bundle-dir `. +- Create a manual CLI-visible bundle with `vastai dump-logs `. +- Include local host OS/kaalia artifacts only when running on the actual host with `vastai dump-logs --include-local-host-artifacts`. + +Default self-test bundles include `self-test-output.log`, `self-test-result.json`, `manifest.json`, and `collection-errors.json`. Runtime failures with a created instance can also include `instance/show-instance.json`, `instance/container.log`, and `instance/daemon.log` from the Vast instance logs API. + + +When the CLI is run from a laptop or other third-party machine, it cannot collect host-local files such as `/var/lib/vastai_kaalia/kaalia.log*`, `dmesg`, `journalctl`, `/etc/docker/daemon.json`, or `/proc/mounts` from the Vast host. Those artifacts require running the helper on the actual host or adding a future daemon/backend log-collection feature. + + +Text artifacts are capped at 262,144 bytes and log artifacts are capped at 262,144 bytes. Obvious API keys, tokens, passwords, and related secrets are redacted, but hosts should still review the tarball before sharing it with support. diff --git a/host/verification-stages.mdx b/host/verification-stages.mdx index 7976ce0..5094069 100644 --- a/host/verification-stages.mdx +++ b/host/verification-stages.mdx @@ -58,7 +58,7 @@ In order to be listed on Vast.ai, the machine must follow these minimum guidelin - Dedicated machines only - the machine shouldn't be doing other stuff while rented - Fast, reliable internet: at least 10Mbps per machine. 
- 10-series Nvidia GPU or MI25 or newer Radeon Instinct series GPU or Radeon VII or Radeon Pro VII or Radeon RX 7900 (GRE/XT/XTX); or Radeon Pro W7900/W7800. Other 6000 series or newer Radeon RX/Pro W series GPUs may be supported; but may not be searchable using standard filters for AMD ROCm. -- At least 1 physical CPU core (2 hyperthreads) per GPU. +- At least 1 physical CPU core per GPU. Hyperthreads/logical CPUs do not count as physical cores, and CPUs without hyperthreading are fine. - Your CPU must support AVX instruction set (not all lower end CPUs support this). - At least 4 GB of system RAM per GPU. - Fast SSD storage with at least 128GB per GPU. @@ -73,12 +73,13 @@ In order to be listed on Vast.ai, the machine must follow these minimum guidelin In order for your unverified machine to be verified, it must also meet the following minimum requirements: ```text Text -- CUDA version greater than or equal to 12.0 +- CUDA version greater than or equal to 11.8 for the self-test image family - Reliability of 90% -- At least 3 open ports per GPU (100 recommended) +- At least 3 direct open ports per GPU. Each instance can use up to 64 ports, so mapping more than 64 ports per listed GPU is usually unnecessary - Internet Download speed must scale with total machine VRAM at ~2.6 Mbps per GiB, with a 100 Mbps floor and 500 Mbps ceiling - Internet Upload speed must scale with total machine VRAM at ~2.6 Mbps per GiB, with a 100 Mbps floor and 500 Mbps ceiling - GPU RAM of 7 GB +- System RAM close to total GPU VRAM, capped around 2 TB for very high-VRAM machines - Passing the Self-Test ``` > **Note:** High-end GPUs are more likely to be verified. Machines with datacenter GPUs such as B200, H200, H100, A100, etc., and those with premium GPUs such as RTX PRO 6000 WS, 8xRTX 5090, 8xRTX 4090, etc., receive prioritized verification processing due to their high demand and performance capabilities. 
diff --git a/package.json b/package.json index 3973d95..bce3ad6 100644 --- a/package.json +++ b/package.json @@ -2,6 +2,7 @@ "scripts": { "build-openapi": "python3 api-reference/openapi/build.py", "check-openapi": "mint openapi-check api-reference/openapi.yaml", + "generate-self-test-reference": "python3 scripts/generate_self_test_reference.py", "dev": "mint dev" }, "dependencies": { diff --git a/scripts/generate_self_test_reference.py b/scripts/generate_self_test_reference.py new file mode 100644 index 0000000..f405145 --- /dev/null +++ b/scripts/generate_self_test_reference.py @@ -0,0 +1,714 @@ +#!/usr/bin/env python3 +"""Generate the host self-test reference page from self-test source code. + +The generator intentionally reads the current Vast CLI diagnostics and +self-test image metadata instead of duplicating threshold and remediation copy +by hand in docs. Pass explicit ``--vast-cli`` and ``--self-test`` paths when +generating from source branches that are not checked out next to this docs repo. 
+""" + +from __future__ import annotations + +import argparse +import ast +import html +import os +import re +import subprocess +import sys +from pathlib import Path +from typing import Any + + +DOCS_ROOT = Path(__file__).resolve().parents[1] +WORKSPACE_ROOT = DOCS_ROOT.parent + + +def existing_default(*candidates: Path) -> Path: + for candidate in candidates: + if candidate.exists(): + return candidate + return candidates[-1] + + +DEFAULT_VAST_CLI = existing_default( + WORKSPACE_ROOT / "vast-cli", + WORKSPACE_ROOT / "vast-cli-con1510-p1", +) +DEFAULT_SELF_TEST = WORKSPACE_ROOT / "self-test" +DEFAULT_OUTPUT = DOCS_ROOT / "host" / "self-test-reference.mdx" + + +def run_git(repo: Path, *args: str) -> str | None: + try: + return subprocess.check_output( + ["git", "-C", str(repo), *args], + text=True, + stderr=subprocess.DEVNULL, + ).strip() + except (subprocess.CalledProcessError, FileNotFoundError): + return None + + +def repo_ref(repo: Path) -> dict[str, str]: + remote = ( + run_git(repo, "remote", "get-url", "upstream") + or run_git(repo, "remote", "get-url", "origin") + or repo.name + ) + if "vast-ai/vast-cli" in remote or "jjziets/vast-python" in remote: + label = "vast-ai/vast-cli" + elif "vast-ai/self-test" in remote or "jjziets/self-test" in remote: + label = "vast-ai/self-test" + else: + label = repo.name + branch = run_git(repo, "branch", "--show-current") or "" + if not branch: + containing_refs = run_git(repo, "branch", "-r", "--contains", "HEAD", "--format=%(refname:short)") or "" + refs = [ref.strip() for ref in containing_refs.splitlines() if ref.strip()] + preferred_refs = ("upstream/master", "origin/master", "upstream/main", "origin/main") + branch = next((ref for ref in preferred_refs if ref in refs), refs[0] if refs else "unknown") + commit = run_git(repo, "rev-parse", "--short", "HEAD") or "unknown" + status = run_git(repo, "status", "--porcelain") or "" + return { + "label": label, + "branch": branch, + "commit": commit, + "dirty": "yes" if status 
else "no", + } + + +def literal_assignment(path: Path, name: str) -> Any: + module = ast.parse(path.read_text(), filename=str(path)) + for node in module.body: + if isinstance(node, ast.Assign): + for target in node.targets: + if isinstance(target, ast.Name) and target.id == name: + return ast.literal_eval(node.value) + if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name) and node.target.id == name: + return ast.literal_eval(node.value) + raise ValueError(f"{name} not found in {path}") + + +def first_regex(text: str, pattern: str, default: str) -> str: + match = re.search(pattern, text) + return match.group(1) if match else default + + +def parse_cli_image_config(vast_cli: Path) -> dict[str, Any]: + machines_py = vast_cli / "vastai" / "cli" / "commands" / "machines.py" + text = machines_py.read_text() + repo = first_regex(text, r'docker_repo\s*=\s*"([^"]+)"', "vastai/test") + prefix = first_regex(text, r'image_tag_prefix\s*=\s*"([^"]+)"', "self-test-v2-cuda") + versions = [] + for left, right in re.findall(r'"(\d+\.\d+)"\s*:\s*image_for\("(\d+\.\d+)"\)', text): + if left == right: + versions.append(left) + return { + "repo": repo, + "prefix": prefix, + "versions": versions or ["11.8", "12.8", "13.0", "13.3"], + } + + +def parse_self_test_image_catalog(self_test: Path) -> dict[str, dict[str, str]]: + readme = self_test / "README.md" + catalog: dict[str, dict[str, str]] = {} + pattern = re.compile( + r"^\| `vastai/test:self-test-cuda-(\d+\.\d+)` \| ([^|]+) \| ([^|]+) \| ([^|]+) \|$" + ) + for line in readme.read_text().splitlines(): + match = pattern.match(line.strip()) + if not match: + continue + version, torch_version, targets, platforms = match.groups() + catalog[version] = { + "torch": torch_version.strip(), + "targets": targets.strip(), + "platforms": platforms.strip(), + } + return catalog + + +def cell(value: Any) -> str: + text = "" if value is None else str(value) + text = normalize_text(text) + text = html.escape(text, quote=False) + 
text = text.replace("\n", "
") + text = text.replace("|", "\\|") + return text + + +def normalize_text(text: str) -> str: + return ( + text.replace("machine_id=example", "machine_id=MACHINE_ID") + .replace("machine_id=", "machine_id=MACHINE_ID") + .replace("1 listed GPU(s)", "the listed GPU count") + .replace("Machine ID", "Machine ID") + .strip() + ) + + +def bullet_lines(items: list[str]) -> list[str]: + return [f"- {item}" for item in items] + + +def code(value: str) -> str: + return f"`{value}`" + + +def preflight_threshold(check: dict[str, Any], system_ram_cap_mib: int) -> str: + check_id = check["id"] + if check_id == "cuda.version": + return "CUDA version >= 11.8" + if check_id == "reliability": + return "Reliability > 0.90" + if check_id == "network.direct_ports": + return "Direct ports >= 3 * listed GPUs" + if check_id == "pcie.bandwidth": + return "PCIe bandwidth > 2.85 GB/s" + if check_id == "network.download": + return "Download >= min(500, max(100, 500 * total_vram_gib / 192)) Mb/s" + if check_id == "network.upload": + return "Upload >= min(500, max(100, 500 * total_vram_gib / 192)) Mb/s" + if check_id == "gpu.ram": + return "Per-GPU VRAM > 7 GiB" + if check_id == "system.ram": + return f"System RAM >= min(0.95 * total GPU VRAM, {system_ram_cap_mib:,} MiB)" + if check_id == "cpu.cores": + return "Physical CPU cores >= listed GPUs" + if check_id == "network.direct_ports.recommended_max": + return "direct ports <= 64 * listed GPUs" + return f"{check.get('operator', '')} {check.get('required', '')} {check.get('unit', '')}".strip() + + +def preflight_purpose(check: dict[str, Any]) -> str: + if check["id"] == "cpu.cores": + return ( + "The tester expects at least one physical CPU core per listed GPU. " + "Hyperthreads/logical CPUs do not count as physical cores." + ) + return check["purpose"] + + +def preflight_remediation(check: dict[str, Any]) -> str: + if check["id"] == "cpu.cores": + return "Add physical CPU cores or reduce the listed GPU count for this offer." 
+ return check["remediation"] + + +def runtime_thresholds(system_ram_cap_mib: int) -> dict[str, str]: + return { + "image_started": "The runtime container starts and writes the first progress event.", + "system_requirements": ( + "Each GPU has at least 98% free VRAM; system RAM is at least " + f"95% of total GPU VRAM capped at {system_ram_cap_mib:,} MiB; " + "physical CPU cores are at least the visible GPU count." + ), + "resnet": "A CUDA ResNet18 workload completes on the visible GPU set at any tested batch size.", + "ecc": "The test allocates 95% of total memory on each visible GPU.", + "nccl": "At least 1 GPU is visible and all NCCL ranks initialize and synchronize on one machine.", + "stress_gpu_burn": "stress-ng and gpu-burn run together for 60 seconds and both exit with code 0.", + "final_summary": "The runtime reports the overall pass/fail result and exit code.", + } + + +NO_OFFER_ROOT_STATE_COPY = { + "currently_rented": "Visible offers exist and one or more are already rented.", + "deverified_or_below_threshold": "Visible offers exist but host reliability, verification state, vericode, or error metadata points to a host quality gate.", + "api_permission_failed": "The API key or account could not read the machine or offer state required by self-test.", + "zero_active_offers": "The machine is visible, but no active on-demand offers are listed for it.", + "offline_or_not_listed": "The machine is not visible to the account or appears offline/not listed.", + "unknown_no_rentable_offer": "Visible offers exist, but the payload does not expose a specific non-rentable reason.", +} + + +PREFLIGHT_FAILURE_CODES = [ + ( + "no_offer", + "No on-demand offer found for the machine.", + "Confirm the machine ID, host online/listed state, and visible offers.", + ), + ( + "no_rentable_offer", + "Offers were visible, but none were currently rentable.", + "Wait for rentals/state refreshes or inspect host offer state.", + ), + ( + "api_permission_failed", + "The API key could 
not inspect the required machine/offer state.", + "Use an API key/account with host machine and offer visibility.", + ), + ( + "preflight_requirements_failed", + "One or more minimum requirement checks failed before renting.", + "Resolve the failed checks or rerun with --ignore-requirements for dogfood only.", + ), +] + + +def failure_area(code_value: str) -> str: + if code_value.startswith("instance_") or code_value in { + "missing_public_ip", + "progress_port_not_mapped", + "progress_endpoint_unreachable", + "progress_endpoint_lost", + "progress_empty_timeout", + "runtime_test_timeout", + "interrupted", + "cleanup_failed", + }: + return "Launch, network, or cleanup" + if code_value in {"docker_pull_failed", "daemon_startup_failed"}: + return "Image or container startup" + if code_value in { + "nvml_failed", + "resnet_failed", + "ecc_failed", + "nccl_failed", + "stress_gpu_burn_failed", + "legacy_progress_error", + }: + return "Runtime test" + return "Runtime" + + +def load_cli_metadata(vast_cli: Path) -> dict[str, Any]: + sys.path.insert(0, str(vast_cli)) + try: + from vastai.cli.self_test.machine_diagnostics import ( # type: ignore + NO_OFFER_ROOT_STATES, + SYSTEM_RAM_REQUIREMENT_CAP_MIB, + preflight_requirement_checks, + ) + from vastai.cli.self_test.runtime_diagnostics import failure_catalog # type: ignore + from vastai.cli.self_test.support_bundle import ( # type: ignore + DEFAULT_BUNDLE_DIR, + MAX_LOG_BYTES, + MAX_TEXT_BYTES, + ) + from vastai.cli.util import required_inet_mbps # type: ignore + finally: + try: + sys.path.remove(str(vast_cli)) + except ValueError: + pass + + sample_offer = { + "id": 1, + "machine_id": "example", + "gpu_name": "RTX_4090", + "num_gpus": 1, + "dph_total": 0.1, + "dlperf": 100, + "cuda_max_good": 13.3, + "compute_cap": 890, + "reliability": 0.99, + "direct_port_count": 100, + "pcie_bw": 16.0, + "inet_down": 500, + "inet_up": 500, + "gpu_ram": 24 * 1024, + "gpu_total_ram": 24 * 1024, + "cpu_ram": 64 * 1024, + "cpu_cores": 8, + 
"rentable": True, + "rented": False, + "verification": "verified", + } + + return { + "preflight_checks": preflight_requirement_checks(sample_offer), + "failure_catalog": failure_catalog(), + "no_offer_root_states": list(NO_OFFER_ROOT_STATES), + "system_ram_cap_mib": SYSTEM_RAM_REQUIREMENT_CAP_MIB, + "support_bundle": { + "default_bundle_dir": DEFAULT_BUNDLE_DIR, + "max_text_bytes": MAX_TEXT_BYTES, + "max_log_bytes": MAX_LOG_BYTES, + }, + "bandwidth_examples": [ + ("8 GiB total VRAM", required_inet_mbps(8 * 1024)), + ("48 GiB total VRAM", required_inet_mbps(48 * 1024)), + ("80 GiB total VRAM", required_inet_mbps(80 * 1024)), + ("96 GiB total VRAM", required_inet_mbps(96 * 1024)), + ("160 GiB total VRAM", required_inet_mbps(160 * 1024)), + ("192 GiB total VRAM or more", required_inet_mbps(192 * 1024)), + ], + } + + +def render_preflight_table(checks: list[dict[str, Any]], system_ram_cap_mib: int) -> str: + lines = [ + "| Check | Gate | Purpose | Host guidance |", + "| --- | --- | --- | --- |", + ] + for check in checks: + status = "Advisory" if check.get("status") == "info" else "Required" + gate = f"{status}: {preflight_threshold(check, system_ram_cap_mib)}" + lines.append( + "| " + + " | ".join( + [ + f"{code(check['id'])}
{cell(check['title'])}", + cell(gate), + cell(preflight_purpose(check)), + cell(preflight_remediation(check)), + ] + ) + + " |" + ) + return "\n".join(lines) + + +def render_bandwidth_examples(examples: list[tuple[str, float]]) -> str: + lines = ["| Total machine VRAM | Required upload and download |", "| --- | --- |"] + for label, mbps in examples: + value = f"{mbps:.2f}".rstrip("0").rstrip(".") + lines.append(f"| {cell(label)} | {cell(value + ' Mb/s')} |") + return "\n".join(lines) + + +def render_runtime_stage_table(event_catalog: dict[str, dict[str, Any]], thresholds: dict[str, str]) -> str: + lines = [ + "| Stage | Pass condition / threshold | Purpose | Failure guidance |", + "| --- | --- | --- | --- |", + ] + for stage, entry in event_catalog.items(): + remediation = " ".join(entry.get("remediation") or []) + lines.append( + "| " + + " | ".join( + [ + f"{code(stage)}
{cell(entry['title'])}", + cell(thresholds.get(stage, "")), + cell(entry["purpose"]), + cell(remediation), + ] + ) + + " |" + ) + return "\n".join(lines) + + +def render_image_table(image_config: dict[str, Any], image_catalog: dict[str, dict[str, str]]) -> str: + lines = [ + "| CLI image | Torch | Targets | Platforms |", + "| --- | --- | --- | --- |", + ] + for version in image_config["versions"]: + details = image_catalog.get(version, {}) + image = f"{image_config['repo']}:{image_config['prefix']}-{version}" + lines.append( + "| " + + " | ".join( + [ + code(image), + cell(details.get("torch", "")), + cell(details.get("targets", "")), + cell(details.get("platforms", "")), + ] + ) + + " |" + ) + return "\n".join(lines) + + +def render_preflight_failure_table(root_states: list[str]) -> str: + displayed_codes = {row[0] for row in PREFLIGHT_FAILURE_CODES} + lines = [ + "| Code or root state | Meaning | Guidance |", + "| --- | --- | --- |", + ] + for code_value, summary, guidance in PREFLIGHT_FAILURE_CODES: + lines.append(f"| {code(code_value)} | {cell(summary)} | {cell(guidance)} |") + for root_state in root_states: + if root_state in displayed_codes: + continue + lines.append( + f"| {code(root_state)} | {cell(NO_OFFER_ROOT_STATE_COPY.get(root_state, 'Root state reported by offer diagnostics.'))} | Inspect all visible offers with `vastai search offers 'machine_id=MACHINE_ID rentable=any rented=any'`. |" + ) + return "\n".join(lines) + + +def render_runtime_failure_table(catalog: dict[str, dict[str, Any]]) -> str: + lines = [ + "| Code | Area | Meaning | Remediation | Suggested steps |", + "| --- | --- | --- | --- | --- |", + ] + for code_value, entry in catalog.items(): + suggested = "
".join(cell(step) for step in entry.get("suggested_steps") or []) + lines.append( + "| " + + " | ".join( + [ + code(code_value), + cell(failure_area(code_value)), + cell(entry["summary"]), + cell(entry["remediation"]), + suggested, + ] + ) + + " |" + ) + return "\n".join(lines) + + +def render_result_interpretation() -> str: + return "\n".join( + [ + "## How To Read The Result", + "", + "Self-test output has two distinct parts: preflight qualification checks and the runtime workload.", + "", + "| Result | What it means | What to do next |", + "| --- | --- | --- |", + "| Normal pass | Minimum requirements passed and the runtime workload passed. The machine is eligible for verification, subject to the normal platform verification process. | Keep the host stable and listed. Verification is still automated and not guaranteed immediately. |", + "| Normal preflight failure | The CLI found one or more requirement failures before renting a temporary instance. | Fix the measured values shown in the CLI, then rerun without `--ignore-requirements`. |", + "| Runtime failure | The CLI rented a temporary instance, started the self-test image, and a runtime stage failed or timed out. | Use the failure code, last runtime stage, and diagnostic bundle to identify the failing subsystem. |", + "| Pass with `--ignore-requirements` | The runtime workload passed, but minimum requirement checks were skipped. This does not qualify the machine for verification. | Treat this as dogfood/runtime validation only. Rerun without `--ignore-requirements` to see qualification status. |", + "", + "", + "If you use `--ignore-requirements`, still review any requirement diagnostics from a normal run. 
A runtime pass proves the container workload can run; it does not prove the machine meets the verification gate.", + "", + ] + ) + + +def render_ports_guidance() -> str: + return "\n".join( + [ + "### Direct Ports And Port Mapping", + "", + "The self-test needs direct public connectivity to the temporary instance. The progress service runs inside the self-test container on `5000/tcp`, but the CLI connects to the mapped external public IP and external port reported by the instance.", + "", + "- Minimum gate: at least 3 direct ports per listed GPU.", + "- Useful cap: each instance can use up to 64 ports. Mapping more than 64 ports per listed GPU is usually unnecessary and is not a self-test requirement.", + "- Port forwarding should target the host's LAN address, not its public address.", + "- Keep TCP and UDP forwarding symmetric where your network setup requires both protocols.", + "- If the CLI reports a tested external IP:port, troubleshoot that external mapping first.", + "- If the host and CLI are on the same LAN, a local failure to reach the public IP can be NAT hairpinning. Retest from an outside network before assuming the port is closed globally.", + "", + "", + "The CLI can report the external progress port it tested when that mapping is available. A full list of exactly which direct ports failed still requires backend or daemon-side exposure.", + "", + ] + ) + + +def render_no_response_guidance() -> str: + return "\n".join( + [ + "### No Response Or Progress Timeout", + "", + "A `no response` or progress timeout means the CLI could not get usable progress from the temporary self-test instance after it was created. 
This is usually a connectivity or startup problem, not a generic verification decision.", + "", + "Common causes:", + "", + "- Router or firewall forwards the external port to the wrong LAN IP.", + "- The external TCP port is closed, blocked, or not hairpin-accessible from the CLI's network.", + "- The self-test container never started, crashed, or did not bind the progress service.", + "- Docker, NVIDIA runtime, or the host daemon stalled during startup.", + "- The GPU or system hung under load before progress could be reported.", + "- Upload/network instability prevented progress responses from reaching the CLI.", + "", + "First checks:", + "", + "- Look at the failure code and the tested external IP:port in CLI output when present.", + "- Confirm the router/firewall forwards that external port to the host machine.", + "- Inspect the diagnostic bundle for `instance/container.log`, `instance/daemon.log`, and `instance/show-instance.json` when the instance existed.", + "- If you ran the CLI from the same LAN as the host, retry from a different network to rule out NAT loopback/hairpinning.", + ] + ) + + +def render_not_rentable_guidance() -> str: + return "\n".join( + [ + "### Not Found Or Not Rentable", + "", + "The old `not found or not rentable` wording hid several different states. 
The newer CLI tries to disambiguate the state before giving guidance.", + "", + "Typical root causes:", + "", + "- The machine is currently rented.", + "- The machine is visible but has zero active on-demand offers.", + "- The machine is offline, unlisted, or not visible to the API account.", + "- The machine is deverified, below the reliability threshold, or has offer-side error metadata.", + "- The API key can authenticate but does not have permission to inspect the required host or offer state.", + "", + "Useful inspection command:", + "", + "```bash", + "vastai search offers 'machine_id=MACHINE_ID rentable=any rented=any'", + "```", + ] + ) + + +def render_bundle_boundary(support: dict[str, Any]) -> str: + return "\n".join( + [ + "## Diagnostic Bundles", + "", + "When a self-test fails, the CLI builds a redacted diagnostic tarball unless bundle creation is disabled.", + "", + "- Default output directory: `" + support["default_bundle_dir"] + "`.", + "- Disable automatic bundles with `--no-support-bundle` or `VAST_SELF_TEST_SUPPORT_BUNDLE=0`.", + "- Choose another directory with `--support-bundle-dir `.", + "- Create a manual CLI-visible bundle with `vastai dump-logs `.", + "- Include local host OS/kaalia artifacts only when running on the actual host with `vastai dump-logs --include-local-host-artifacts`.", + "", + "Default self-test bundles include `self-test-output.log`, `self-test-result.json`, `manifest.json`, and `collection-errors.json`. Runtime failures with a created instance can also include `instance/show-instance.json`, `instance/container.log`, and `instance/daemon.log` from the Vast instance logs API.", + "", + "", + "When the CLI is run from a laptop or other third-party machine, it cannot collect host-local files such as `/var/lib/vastai_kaalia/kaalia.log*`, `dmesg`, `journalctl`, `/etc/docker/daemon.json`, or `/proc/mounts` from the Vast host. 
Those artifacts require running the helper on the actual host or adding a future daemon/backend log-collection feature.", + "", + "", + f"Text artifacts are capped at {support['max_text_bytes']:,} bytes and log artifacts are capped at {support['max_log_bytes']:,} bytes. Obvious API keys, tokens, passwords, and related secrets are redacted, but hosts should still review the tarball before sharing it with support.", + ] + ) + + +def render_page(vast_cli: Path, self_test: Path) -> str: + cli = load_cli_metadata(vast_cli) + event_catalog = literal_assignment(self_test / "remote.py", "EVENT_CATALOG") + image_config = parse_cli_image_config(vast_cli) + image_catalog = parse_self_test_image_catalog(self_test) + system_ram_cap_mib = cli["system_ram_cap_mib"] + support = cli["support_bundle"] + + vast_cli_ref = repo_ref(vast_cli) + self_test_ref = repo_ref(self_test) + + lines = [ + "---", + 'title: "Verification / Self-test Reference"', + 'sidebarTitle: "Self-test Reference"', + 'description: "Generated reference for host self-test checks, thresholds, failure codes, and guidance."', + '"canonical": "/host/self-test-reference"', + "---", + "", + "{/*", + " This page is generated by scripts/generate_self_test_reference.py.", + " Do not edit this file by hand; update the Vast CLI/self-test source metadata, then regenerate.", + f" Source: {vast_cli_ref['label']} {vast_cli_ref['branch']}@{vast_cli_ref['commit']} dirty={vast_cli_ref['dirty']}.", + f" Source: {self_test_ref['label']} {self_test_ref['branch']}@{self_test_ref['commit']} dirty={self_test_ref['dirty']}.", + "*/}", + "", + "The host self-test is the quickest way to check whether a listed machine can pass Vast.ai's minimum verification gate and run the runtime workload used by the tester.", + "", + "When you run `vastai self-test machine `, the CLI selects a rentable offer for that machine, checks minimum requirements, rents one temporary diagnostic instance, starts the self-test image, polls the runtime progress 
endpoint, reports the result, and destroys the temporary instance.", + "", + "", + "Passing this self-test makes a machine eligible for verification, but it does not guarantee that the machine will be verified immediately. Verification also depends on ongoing health, reliability, supply and demand, and platform policy.", + "", + "", + "", + "`--ignore-requirements` is for dogfooding only. A pass with requirement checks ignored does not qualify the machine for verification.", + "", + "", + render_result_interpretation(), + "", + "## Preflight Checks", + "", + "These checks run before the CLI rents the temporary self-test instance. Failed required checks stop the normal flow before billing starts.", + "", + render_preflight_table(cli["preflight_checks"], system_ram_cap_mib), + "", + render_ports_guidance(), + "", + "### Bandwidth Formula", + "", + "Upload and download thresholds scale with total machine VRAM:", + "", + "```text", + "required_mbps = min(500, max(100, 500 * total_vram_gib / 192))", + "```", + "", + render_bandwidth_examples(cli["bandwidth_examples"]), + "", + "## Self-test Image Selection", + "", + "The CLI selects from the self-test image family unless `--test-image` or `VAST_SELF_TEST_IMAGE` overrides the image for dogfood testing.", + "", + render_image_table(image_config, image_catalog), + "", + "Selection rules:", + "", + "- Pre-Volta GPUs (`compute_cap < 700`) use the CUDA 11.8 image.", + "- Volta GPUs (`compute_cap < 750`) are capped at CUDA 12.8 because newer PyTorch CUDA 13 wheels do not include sm_70 support.", + "- Other hosts use the newest supported self-test image that is less than or equal to `cuda_max_good`.", + "", + "## Runtime Stages", + "", + "After preflight passes, the CLI starts the self-test image and polls the runtime progress service on container port `5000/tcp`.", + "", + render_runtime_stage_table(event_catalog, runtime_thresholds(system_ram_cap_mib)), + "", + "## Offer Selection And Preflight Issues", + "", + "The CLI 
reports stable preflight failure codes and, when possible, a likely root state for machines that are not currently rentable.", + "", + render_not_rentable_guidance(), + "", + render_preflight_failure_table(cli["no_offer_root_states"]), + "", + "## Runtime Failure Codes", + "", + "Runtime failure codes are stable identifiers intended for CLI output, support workflows, and host-facing guidance.", + "", + render_no_response_guidance(), + "", + render_runtime_failure_table(cli["failure_catalog"]), + "", + render_bundle_boundary(support), + "", + ] + + return "\n".join(lines) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--vast-cli", + default=os.environ.get("VAST_CLI_REPO", str(DEFAULT_VAST_CLI)), + type=Path, + help="Path to a vast-ai/vast-cli checkout or PR worktree.", + ) + parser.add_argument( + "--self-test", + default=os.environ.get("VAST_SELF_TEST_REPO", str(DEFAULT_SELF_TEST)), + type=Path, + help="Path to a vast-ai/self-test checkout.", + ) + parser.add_argument( + "--output", + default=str(DEFAULT_OUTPUT), + type=Path, + help="MDX file to write.", + ) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + vast_cli = args.vast_cli.resolve() + self_test = args.self_test.resolve() + output = args.output.resolve() + + if not (vast_cli / "vastai" / "cli" / "self_test").exists(): + raise SystemExit(f"vast-cli self-test diagnostics not found: {vast_cli}") + if not (self_test / "remote.py").exists(): + raise SystemExit(f"self-test remote.py not found: {self_test}") + + page = render_page(vast_cli, self_test) + output.parent.mkdir(parents=True, exist_ok=True) + output.write_text(page) + print(f"Wrote {output}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main())