diff --git a/.github/buildomat/phd-run-with-args.sh b/.github/buildomat/phd-run-with-args.sh index d7b6bb463..bc23bc85a 100755 --- a/.github/buildomat/phd-run-with-args.sh +++ b/.github/buildomat/phd-run-with-args.sh @@ -54,6 +54,7 @@ args=( '--artifact-toml-path' $artifacts '--tmp-directory' $tmpdir '--artifact-directory' $artifactdir + '--parallelism' 2 $@ ) @@ -64,8 +65,20 @@ set +e failcount=$? set -e +# Disable errexit again because we may try collecting logs from runners that ran +# no tests (in which case *.log doesn't expand and tar will fail to find a file +# by that literal name) +set +e tar -czvf /tmp/phd-tmp-files.tar.gz \ - -C /tmp/propolis-phd /tmp/propolis-phd/*.log + -C $tmpdir $tmpdir/*.log +for runnerdir in $tmpdir/runner-*; do + if [ -d "$runnerdir" ]; then + tar -rzvf /tmp/phd-tmp-files.tar.gz \ + -C $runnerdir $runnerdir/*.log + fi +done +set -e + exitcode=0 if [ $failcount -eq 0 ]; then diff --git a/Cargo.lock b/Cargo.lock index b2203c19d..17183cc9f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -777,6 +777,7 @@ dependencies = [ "propolis_types", "proptest", "thiserror 1.0.64", + "uuid", ] [[package]] @@ -3998,9 +3999,12 @@ version = "0.1.0" dependencies = [ "anyhow", "backtrace", + "bhyve_api 0.0.0", "camino", "cargo_metadata", "clap", + "crossbeam-channel", + "libc", "phd-framework", "phd-tests", "tokio", diff --git a/crates/bhyve-api/src/lib.rs b/crates/bhyve-api/src/lib.rs index df4e32e28..e59af13fd 100644 --- a/crates/bhyve-api/src/lib.rs +++ b/crates/bhyve-api/src/lib.rs @@ -125,6 +125,7 @@ impl AsRawFd for VmmCtlFd { } } +#[derive(Debug)] pub enum ReservoirError { /// Resizing operation was interrupted, but if a non-zero chunk size was /// specified, one or more chunk-sized adjustments to the reservoir size may diff --git a/crates/cpuid-utils/Cargo.toml b/crates/cpuid-utils/Cargo.toml index 096eb5bbc..1563ba5df 100644 --- a/crates/cpuid-utils/Cargo.toml +++ b/crates/cpuid-utils/Cargo.toml @@ -10,6 +10,7 @@ bhyve_api.workspace = true propolis_api_types = {workspace = true, optional = true} propolis_types.workspace = true thiserror.workspace = true +uuid.workspace = true [dev-dependencies] proptest.workspace = true diff --git a/crates/cpuid-utils/src/host.rs b/crates/cpuid-utils/src/host.rs index 539c03b3d..65ca95087 100644 --- a/crates/cpuid-utils/src/host.rs +++ b/crates/cpuid-utils/src/host.rs @@ -5,6 +5,7 @@ use bhyve_api::{VmmCtlFd, VmmFd}; use propolis_types::{CpuidIdent, CpuidValues, CpuidVendor}; use thiserror::Error; +use uuid::Uuid; use crate::{ bits::{ @@ -32,7 +33,8 @@ struct Vm(bhyve_api::VmmFd); impl Vm { fn new() -> Result { - let name = format!("cpuid-gen-{}", std::process::id()); + let name = + format!("cpuid-gen-{}-{}", std::process::id(), Uuid::new_v4()); let mut req = bhyve_api::vm_create_req::new(name.as_bytes()) .expect("valid VM name"); diff --git a/phd-tests/runner/Cargo.toml b/phd-tests/runner/Cargo.toml index 57054e88f..a07024ed3 100644 --- a/phd-tests/runner/Cargo.toml +++ b/phd-tests/runner/Cargo.toml @@ -12,8 +12,11 @@ doctest = false [dependencies] anyhow.workspace = true backtrace.workspace = true +bhyve_api.workspace = true camino.workspace = true clap = { workspace = true, features = ["derive"] } +crossbeam-channel.workspace = true +libc.workspace = true phd-framework.workspace = true phd-tests.workspace = true tokio = { workspace = true, features = ["full"] } diff --git a/phd-tests/runner/src/config.rs b/phd-tests/runner/src/config.rs index 1e152d08f..3edf351a4 100644 --- a/phd-tests/runner/src/config.rs +++ 
b/phd-tests/runner/src/config.rs
@@ -156,6 +156,12 @@ pub struct RunOptions {
     #[clap(long, default_value = "file")]
     pub server_logging_mode: ServerLogMode,
 
+    /// The parallelism with which to run PHD tests. If not provided, phd-runner
+    /// will guess a reasonable number from the test environment's number of
+    /// CPUs and available memory.
+    #[clap(long, value_parser)]
+    pub parallelism: Option<u16>,
+
     /// The number of CPUs to assign to the guest in tests where the test is
     /// using the default machine configuration.
     #[clap(long, value_parser, default_value = "2")]
diff --git a/phd-tests/runner/src/execute.rs b/phd-tests/runner/src/execute.rs
index 3126e3a31..72d5d0e5e 100644
--- a/phd-tests/runner/src/execute.rs
+++ b/phd-tests/runner/src/execute.rs
@@ -2,7 +2,7 @@
 // License, v. 2.0. If a copy of the MPL was not distributed with this
 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
 
-use std::sync::Arc;
+use std::sync::{Arc, Mutex};
 use std::time::{Duration, Instant};
 
 use phd_tests::phd_testcase::{Framework, TestCase, TestOutcome};
@@ -37,21 +37,9 @@ pub struct ExecutionStats {
     pub failed_test_cases: Vec<&'static TestCase>,
 }
 
-#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
-enum Status {
-    Ran(TestOutcome),
-    NotRun,
-}
-
-struct Execution {
-    tc: &'static TestCase,
-    status: Status,
-}
-
 /// Executes a set of tests using the supplied test context.
 pub async fn run_tests_with_ctx(
-    ctx: &Arc<Framework>,
-    mut fixtures: TestFixtures,
+    ctx: &mut Vec<(Arc<Framework>, TestFixtures)>,
     run_opts: &RunOptions,
 ) -> ExecutionStats {
     let mut executions = Vec::new();
@@ -60,10 +48,10 @@ pub async fn run_tests_with_ctx(
         &run_opts.include_filter,
         &run_opts.exclude_filter,
     ) {
-        executions.push(Execution { tc, status: Status::NotRun });
+        executions.push(tc);
     }
 
-    let mut stats = ExecutionStats {
+    let stats = ExecutionStats {
         tests_passed: 0,
         tests_failed: 0,
         tests_skipped: 0,
@@ -77,90 +65,128 @@ pub async fn run_tests_with_ctx(
         return stats;
     }
 
-    fixtures.execution_setup().unwrap();
-    let sigint_rx = set_sigint_handler();
-    info!("Running {} test(s)", executions.len());
-    let start_time = Instant::now();
-    for execution in &mut executions {
-        if *sigint_rx.borrow() {
-            info!("Test run interrupted by SIGINT");
-            break;
-        }
+    let stats = Arc::new(Mutex::new(stats));
 
-        info!("Starting test {}", execution.tc.fully_qualified_name());
+    async fn run_tests(
+        execution_rx: crossbeam_channel::Receiver<&'static TestCase>,
+        test_ctx: Arc<Framework>,
+        mut fixtures: TestFixtures,
+        stats: Arc<Mutex<ExecutionStats>>,
+        sigint_rx: watch::Receiver<bool>,
+    ) -> Result<(), ()> {
+        fixtures.execution_setup().unwrap();
 
-        // Failure to run a setup fixture is fatal to the rest of the run, but
-        // it's still possible to report results, so return gracefully instead
-        // of panicking.
-        if let Err(e) = fixtures.test_setup() {
-            error!("Error running test setup fixture: {}", e);
-            break;
-        }
+        loop {
+            // Check for SIGINT only at the top of the loop because while
+            // waiting for a new testcase is theoretically a blocking
+            // operation, it won't be in a meaningful way for our use. The
+            // recv() will return immediately because either there are more
+            // testcases to run or the sender is closed. The only long
+            // blocking operation to check against in this loop is the test
+            // run itself.
+            if *sigint_rx.borrow() {
+                info!("Test run interrupted by SIGINT");
+                break;
+            }
 
-        stats.tests_not_run -= 1;
-        let test_ctx = ctx.clone();
-        let tc = execution.tc;
-        let mut sigint_rx_task = sigint_rx.clone();
-        let test_outcome = tokio::spawn(async move {
-            tokio::select!
{ - // Ensure interrupt signals are always handled instead of - // continuing to run the test. - biased; - result = sigint_rx_task.changed() => { - assert!( - result.is_ok(), - "SIGINT channel shouldn't drop while tests are running" - ); - - TestOutcome::Failed( - Some("test interrupted by SIGINT".to_string()) - ) + let tc = match execution_rx.recv() { + Ok(tc) => tc, + Err(_) => { + // RecvError means the channel is closed, so we're all + // done. + break; } - outcome = tc.run(test_ctx.as_ref()) => outcome + }; + + info!("Starting test {}", tc.fully_qualified_name()); + + // Failure to run a setup fixture is fatal to the rest of the + // run, but it's still possible to report results, so return + // gracefully instead of panicking. + if let Err(e) = fixtures.test_setup() { + error!("Error running test setup fixture: {}", e); + // TODO: set this on stats too + break; } - }) - .await - .unwrap_or_else(|_| { - TestOutcome::Failed(Some( - "test task panicked, see test logs".to_string(), - )) - }); - - info!( - "test {} ... {}{}", - execution.tc.fully_qualified_name(), - match test_outcome { - TestOutcome::Passed => "ok", - TestOutcome::Failed(_) => "FAILED: ", - TestOutcome::Skipped(_) => "skipped: ", - }, - match &test_outcome { - TestOutcome::Failed(Some(s)) - | TestOutcome::Skipped(Some(s)) => s, - TestOutcome::Failed(None) | TestOutcome::Skipped(None) => - "[no message]", - _ => "", + + { + let mut stats = stats.lock().unwrap(); + stats.tests_not_run -= 1; } - ); - match test_outcome { - TestOutcome::Passed => stats.tests_passed += 1, - TestOutcome::Failed(_) => { - stats.tests_failed += 1; - stats.failed_test_cases.push(execution.tc); + let test_outcome = tc.run(test_ctx.as_ref()).await; + + info!( + "test {} ... {}{}", + tc.fully_qualified_name(), + match test_outcome { + TestOutcome::Passed => "ok", + TestOutcome::Failed(_) => "FAILED: ", + TestOutcome::Skipped(_) => "skipped: ", + }, + match &test_outcome { + TestOutcome::Failed(Some(s)) + | TestOutcome::Skipped(Some(s)) => s, + TestOutcome::Failed(None) | TestOutcome::Skipped(None) => + "[no message]", + _ => "", + } + ); + + { + let mut stats = stats.lock().unwrap(); + match test_outcome { + TestOutcome::Passed => stats.tests_passed += 1, + TestOutcome::Failed(_) => { + stats.tests_failed += 1; + stats.failed_test_cases.push(tc); + } + TestOutcome::Skipped(_) => stats.tests_skipped += 1, + } } - TestOutcome::Skipped(_) => stats.tests_skipped += 1, - } - execution.status = Status::Ran(test_outcome); - if let Err(e) = fixtures.test_cleanup().await { - error!("Error running cleanup fixture: {}", e); - break; + if let Err(e) = fixtures.test_cleanup().await { + error!("Error running cleanup fixture: {}", e); + // TODO: set this on stats + break; + } } + + fixtures.execution_cleanup().unwrap(); + + Ok(()) } - stats.duration = start_time.elapsed(); - fixtures.execution_cleanup().unwrap(); + let sigint_rx = set_sigint_handler(); + info!("Running {} test(s)", executions.len()); + let start_time = Instant::now(); + + let (execution_tx, execution_rx) = + crossbeam_channel::unbounded::<&'static TestCase>(); + + let mut test_runners = tokio::task::JoinSet::new(); + + for (ctx, fixtures) in ctx.drain(..) 
{
+        test_runners.spawn(run_tests(
+            execution_rx.clone(),
+            ctx,
+            fixtures,
+            Arc::clone(&stats),
+            sigint_rx.clone(),
+        ));
+    }
+
+    for execution in executions {
+        execution_tx.send(execution).expect("ok");
+    }
+    std::mem::drop(execution_tx);
+
+    let _ = test_runners.join_all().await;
+
+    let mut stats =
+        Mutex::into_inner(Arc::into_inner(stats).expect("only one ref"))
+            .expect("lock not panicked");
+    stats.duration = start_time.elapsed();
     stats
 }
diff --git a/phd-tests/runner/src/main.rs b/phd-tests/runner/src/main.rs
index 225f5a45b..e7590fa5d 100644
--- a/phd-tests/runner/src/main.rs
+++ b/phd-tests/runner/src/main.rs
@@ -6,6 +6,7 @@ mod config;
 mod execute;
 mod fixtures;
 
+use anyhow::{bail, Context};
 use clap::Parser;
 use config::{ListOptions, ProcessArgs, RunOptions};
 use phd_tests::phd_testcase::{Framework, FrameworkParameters};
@@ -46,36 +47,233 @@ async fn main() -> anyhow::Result<()> {
     Ok(())
 }
 
+fn guess_max_reasonable_parallelism(
+    default_guest_cpus: u8,
+    default_guest_memory_mib: u64,
+) -> anyhow::Result<u16> {
+    // Assume no test starts more than 3 VMs. This is a really conservative
+    // guess to make sure we don't cause tests to fail simply because we ran
+    // too many at once.
+    const MAX_VMS_GUESS: u64 = 3;
+    let cpus_per_runner = default_guest_cpus as u64 * MAX_VMS_GUESS;
+    let memory_mib_per_runner =
+        (default_guest_memory_mib * MAX_VMS_GUESS) as usize;
+
+    /// Minuscule wrapper for `sysconf(3C)` calls that checks errors.
+    fn sysconf(cfg: i32) -> std::io::Result<i64> {
+        // Safety: sysconf is an FFI call but we don't change any system
+        // state, it won't cause unwinding, etc.
+        let res = unsafe { libc::sysconf(cfg) };
+        // For the handful of variables that can be queried, the variable is
+        // defined and won't error. Technically if the variable is
+        // unsupported, `-1` is returned without changing `errno`. In such
+        // cases, returning errno might be misleading!
+        //
+        // Instead of trying to disambiguate this, and in the knowledge that
+        // these calls as we make them should never fail, just fall back to
+        // a more general, always-factually-correct error.
+        if res == -1 {
+            return Err(std::io::Error::new(
+                std::io::ErrorKind::Other,
+                format!("could not get sysconf({})", cfg),
+            ));
+        }
+        Ok(res)
+    }
+
+    let online_cpus = sysconf(libc::_SC_NPROCESSORS_ONLN)
+        .expect("can get number of online processors")
+        as u64;
+    // We're assuming that the system running tests is relatively idle other
+    // than the test runner itself. Overprovisioning CPUs will make everyone
+    // sad but should not fail tests, at least...
+    let lim_by_cpus = online_cpus / cpus_per_runner;
+    info!(
+        "parallelism by cpu count: {} ({} / {})",
+        lim_by_cpus, online_cpus, cpus_per_runner
+    );
+
+    let ctl = bhyve_api::VmmCtlFd::open()?;
+    let reservoir =
+        ctl.reservoir_query().context("failed to query reservoir")?;
+    let mut vmm_mem_limit = reservoir.vrq_free_sz;
+
+    // The reservoir will be 0MiB by default if the system has not been
+    // configured with a particular size.
+    if reservoir.vrq_alloc_sz == 0 {
+        // If the reservoir is not configured, we'll try to make do with
+        // system memory and implore someone to earmark memory for test VMs in
+        // the future.
+        let page_size: usize = sysconf(libc::_SC_PAGESIZE)
+            .expect("can get page size")
+            .try_into()
+            .expect("page size is reasonable");
+        let total_pages: usize = sysconf(libc::_SC_PHYS_PAGES)
+            .expect("can get physical pages in the system")
+            .try_into()
+            .expect("physical page count is reasonable");
+
+        const MB: usize = 1024 * 1024;
+
+        let installed_mb = page_size * total_pages / MB;
+        // /!\ Arbitrary choice warning /!\
+        //
+        // It would be a little rude to spawn so many VMs that we cause the
+        // system running tests to empty the whole ARC and swap. If there's no
+        // reservoir, though, we're gonna use *some* amount of memory that isn't
+        // explicitly earmarked for bhyve. 1/4th is just a "feels ok"
+        // fraction.
+        vmm_mem_limit = installed_mb / 4;
+
+        warn!(
+            "phd-runner sees the VMM reservoir is unconfigured, and will use \
+            up to 25% of system memory ({}MiB) for test VMs. Please consider \
+            using `cargo run --bin rsrvrctl set <size>` to set aside \
+            memory for test VMs.",
+            vmm_mem_limit
+        );
+    }
+
+    let lim_by_mem = vmm_mem_limit / memory_mib_per_runner;
+    info!(
+        "parallelism by memory: {} ({} / {})",
+        lim_by_mem, vmm_mem_limit, memory_mib_per_runner
+    );
+
+    Ok(std::cmp::min(lim_by_cpus as u16, lim_by_mem as u16))
+}
+
 async fn run_tests(run_opts: &RunOptions) -> anyhow::Result<ExecutionStats> {
-    let ctx_params = FrameworkParameters {
-        propolis_server_path: run_opts.propolis_server_cmd.clone(),
-        crucible_downstairs: run_opts.crucible_downstairs()?,
-        base_propolis: run_opts.base_propolis(),
-        tmp_directory: run_opts.tmp_directory.clone(),
-        artifact_directory: run_opts.artifact_directory(),
-        artifact_toml: run_opts.artifact_toml_path.clone(),
-        server_log_mode: run_opts.server_logging_mode,
-        default_guest_cpus: run_opts.default_guest_cpus,
-        default_guest_memory_mib: run_opts.default_guest_memory_mib,
-        default_guest_os_artifact: run_opts.default_guest_artifact.clone(),
-        default_bootrom_artifact: run_opts.default_bootrom_artifact.clone(),
-        port_range: 9000..10000,
-        max_buildomat_wait: Duration::from_secs(
-            run_opts.max_buildomat_wait_secs,
-        ),
+    let parallelism = if let Some(parallelism) = run_opts.parallelism {
+        if parallelism == 0 {
+            bail!("Parallelism of 0 was requested; cannot run tests under these conditions!");
+        }
+        parallelism
+    } else {
+        let res = guess_max_reasonable_parallelism(
+            run_opts.default_guest_cpus,
+            run_opts.default_guest_memory_mib,
+        )?;
+        if res == 0 {
+            bail!(
+                "Inferred a parallelism of 0; this is probably because there \
+                is not much available memory for test VMs? Consider checking \
+                reservoir configuration."
+            );
+        }
+        res
     };
 
-    let ctx = Arc::new(
-        Framework::new(ctx_params)
-            .await
-            .expect("should be able to set up a test context"),
-    );
+    info!("running tests with max parallelism of {}", parallelism);
+
+    // /!\ Arbitrary choice warning /!\
+    //
+    // We probably only need a half dozen ports at most for any test. 125 is an
+    // incredible overallocation to never have to worry about the problem, as
+    // long as we don't have hundreds of test runners running concurrently.
+    const PORT_RANGE_PER_RUNNER: u16 = 125;
+
+    let mut runners = Vec::new();
+
+    if parallelism == 1 {
+        // If there's only one test runner, the provided config is by definition
+        // not going to have issues with parallelism. Consume it directly.
+        let ctx_params = FrameworkParameters {
+            propolis_server_path: run_opts.propolis_server_cmd.clone(),
+            crucible_downstairs: run_opts.crucible_downstairs()?,
+            base_propolis: run_opts.base_propolis(),
+            tmp_directory: run_opts.tmp_directory.clone(),
+            artifact_directory: run_opts.artifact_directory(),
+            artifact_toml: run_opts.artifact_toml_path.clone(),
+            server_log_mode: run_opts.server_logging_mode,
+            default_guest_cpus: run_opts.default_guest_cpus,
+            default_guest_memory_mib: run_opts.default_guest_memory_mib,
+            default_guest_os_artifact: run_opts.default_guest_artifact.clone(),
+            default_bootrom_artifact: run_opts.default_bootrom_artifact.clone(),
+            port_range: 9000..(9000 + PORT_RANGE_PER_RUNNER),
+            max_buildomat_wait: Duration::from_secs(
+                run_opts.max_buildomat_wait_secs,
+            ),
+        };
+
+        let ctx = Arc::new(
+            Framework::new(ctx_params)
+                .await
+                .expect("should be able to set up a test context"),
+        );
+
+        let fixtures = TestFixtures::new(ctx.clone()).unwrap();
+        runners.push((ctx, fixtures));
+    } else {
+        // Create up to `parallelism` collections of PHD framework settings. Many
+        // settings can be reused directly, but some describe state the
+        // framework will mutate (causing ports to be allocated, fetching
+        // artifacts, etc). Take the provided config and make it safe for
+        // Frameworks to use independently.
+        //
+        // This implies downloading artifacts {parallelism} times, extracting
+        // artifacts redundantly for the different artifact directories, and
+        // making terrible use of the port range. It'd be much better to accept
+        // some owner of the shared state that each Framework can take as a
+        // parameter.
+
+        // We'll separate directories by which Framework they're for in some
+        // cases, below. Those uniquifying names will be padded to all be the
+        // length of the longest uniquifier to help make it kind of legible.
+        let pad_size = format!("{}", parallelism - 1).len();
+
+        for i in 0..parallelism {
+            let port_range = (9000 + PORT_RANGE_PER_RUNNER * i)
+                ..(9000 + PORT_RANGE_PER_RUNNER * (i + 1));
+            let mut unshared_tmp_dir = run_opts.tmp_directory.clone();
+            unshared_tmp_dir.push(format!("runner-{:pad_size$}", i));
+
+            let mut unshared_artifacts_dir =
+                run_opts.artifact_directory().clone();
+            unshared_artifacts_dir.push(format!("runner-{:pad_size$}", i));
+
+            // The runner directories may well not exist; we just invented them,
+            // after all. Create them so phd-runner can use them. This is awful.
+            // Sorry.
+ let _ = std::fs::create_dir(&unshared_tmp_dir); + let _ = std::fs::create_dir(&unshared_artifacts_dir); - let fixtures = TestFixtures::new(ctx.clone()).unwrap(); + let ctx_params = FrameworkParameters { + propolis_server_path: run_opts.propolis_server_cmd.clone(), + crucible_downstairs: run_opts.crucible_downstairs()?, + base_propolis: run_opts.base_propolis(), + tmp_directory: unshared_tmp_dir, + artifact_directory: unshared_artifacts_dir, + artifact_toml: run_opts.artifact_toml_path.clone(), + server_log_mode: run_opts.server_logging_mode, + default_guest_cpus: run_opts.default_guest_cpus, + default_guest_memory_mib: run_opts.default_guest_memory_mib, + default_guest_os_artifact: run_opts + .default_guest_artifact + .clone(), + default_bootrom_artifact: run_opts + .default_bootrom_artifact + .clone(), + port_range, + max_buildomat_wait: Duration::from_secs( + run_opts.max_buildomat_wait_secs, + ), + }; + + let ctx = Arc::new( + Framework::new(ctx_params) + .await + .expect("should be able to set up a test context"), + ); + + let fixtures = TestFixtures::new(ctx.clone()).unwrap(); + runners.push((ctx, fixtures)); + } + } // Run the tests and print results. let execution_stats = - execute::run_tests_with_ctx(&ctx, fixtures, run_opts).await; + execute::run_tests_with_ctx(&mut runners, run_opts).await; if !execution_stats.failed_test_cases.is_empty() { println!("\nfailures:"); for tc in &execution_stats.failed_test_cases {