Commit 316d955

test(ab): generalize --test to --pytest-opts
Allow passing arbitrary pytest options through to the ab-testing script, so that things like `-k` can be used for test selection.

Signed-off-by: Patrick Roy <[email protected]>
1 parent fcb39a6
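With this change, test selection moves into the pytest option string: whatever is passed via --pytest-opts is handed through to pytest verbatim. A hypothetical invocation (the build directories and -k expression below are illustrative, not taken from the commit):

    ./tools/ab_test.py run build/main/ build/pr --pytest-opts \
        "-k test_boottime integration_tests/performance/test_boottime.py"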

2 files changed: 23 additions, 20 deletions

.buildkite/pipeline_perf.py

Lines changed: 12 additions & 14 deletions
@@ -18,41 +18,41 @@
 perf_test = {
     "virtio-block": {
         "label": "💿 Virtio Block Performance",
-        "test_path": "integration_tests/performance/test_block_ab.py::test_block_performance",
+        "tests": "integration_tests/performance/test_block_ab.py::test_block_performance",
         "devtool_opts": "-c 1-10 -m 0",
     },
     "vhost-user-block": {
         "label": "💿 vhost-user Block Performance",
-        "test_path": "integration_tests/performance/test_block_ab.py::test_block_vhost_user_performance",
+        "tests": "integration_tests/performance/test_block_ab.py::test_block_vhost_user_performance",
         "devtool_opts": "-c 1-10 -m 0",
         "ab_opts": "--noise-threshold 0.1",
     },
     "network": {
         "label": "📠 Network Latency and Throughput",
-        "test_path": "integration_tests/performance/test_network_ab.py",
+        "tests": "integration_tests/performance/test_network_ab.py",
         "devtool_opts": "-c 1-10 -m 0",
         # Triggers if delta is > 0.01ms (10µs) or default relative threshold (5%)
         # only relevant for latency test, throughput test will always be magnitudes above this anyway
         "ab_opts": "--absolute-strength 0.010",
     },
     "snapshot-latency": {
         "label": "📸 Snapshot Latency",
-        "test_path": "integration_tests/performance/test_snapshot_ab.py::test_restore_latency integration_tests/performance/test_snapshot_ab.py::test_post_restore_latency",
+        "tests": "integration_tests/performance/test_snapshot_ab.py::test_restore_latency integration_tests/performance/test_snapshot_ab.py::test_post_restore_latency",
         "devtool_opts": "-c 1-12 -m 0",
     },
     "population-latency": {
         "label": "📸 Memory Population Latency",
-        "test_path": "integration_tests/performance/test_snapshot_ab.py::test_population_latency",
+        "tests": "integration_tests/performance/test_snapshot_ab.py::test_population_latency",
         "devtool_opts": "-c 1-12 -m 0",
     },
     "vsock-throughput": {
         "label": "🧦 Vsock Throughput",
-        "test_path": "integration_tests/performance/test_vsock_ab.py",
+        "tests": "integration_tests/performance/test_vsock_ab.py",
         "devtool_opts": "-c 1-10 -m 0",
     },
     "memory-overhead": {
         "label": "💾 Memory Overhead and 👢 Boottime",
-        "test_path": "integration_tests/performance/test_memory_overhead.py integration_tests/performance/test_boottime.py::test_boottime",
+        "tests": "integration_tests/performance/test_memory_overhead.py integration_tests/performance/test_boottime.py::test_boottime",
         "devtool_opts": "-c 1-10 -m 0",
     },
 }
@@ -93,23 +93,21 @@
 tests = [perf_test[test] for test in pipeline.args.test or perf_test.keys()]
 for test in tests:
     devtool_opts = test.pop("devtool_opts")
-    test_path = test.pop("test_path")
+    test_selector = test.pop("tests")
     ab_opts = test.pop("ab_opts", "")
     devtool_opts += " --performance"
-    pytest_opts = ""
+    test_script_opts = ""
     if REVISION_A:
         devtool_opts += " --ab"
-        pytest_opts = (
-            f"{ab_opts} run build/{REVISION_A}/ build/{REVISION_B} --test {test_path}"
-        )
+        test_script_opts = f'{ab_opts} run build/{REVISION_A}/ build/{REVISION_B} --pytest-opts "{test_selector}"'
     else:
         # Passing `-m ''` below instructs pytest to collect tests regardless of
         # their markers (e.g. it will collect both tests marked as nonci, and
         # tests without any markers).
-        pytest_opts += f" -m '' {test_path}"
+        test_script_opts += f" -m '' {test_selector}"
 
     pipeline.build_group(
-        command=pipeline.devtool_test(devtool_opts, pytest_opts),
+        command=pipeline.devtool_test(devtool_opts, test_script_opts),
         # and the rest can be command arguments
         **test,
     )
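For illustration only (not part of the commit): with hypothetical revisions main and pr, the virtio-block entry above now yields a test_script_opts string along the following lines. The selector is wrapped in double quotes so that the shell delivers it to ab_test.py as a single --pytest-opts value:

    # Hypothetical stand-ins; REVISION_A/REVISION_B normally come from the pipeline environment.
    REVISION_A, REVISION_B = "main", "pr"
    ab_opts = ""
    test_selector = "integration_tests/performance/test_block_ab.py::test_block_performance"
    test_script_opts = f'{ab_opts} run build/{REVISION_A}/ build/{REVISION_B} --pytest-opts "{test_selector}"'
    print(test_script_opts)
    # ' run build/main/ build/pr --pytest-opts "integration_tests/performance/test_block_ab.py::test_block_performance"'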

tools/ab_test.py

Lines changed: 11 additions & 6 deletions
@@ -174,19 +174,20 @@ def load_data_series(report_path: Path, tag=None, *, reemit: bool = False):
     return post_processed_emf
 
 
-def collect_data(binary_dir: Path, tests: list[str]):
+def collect_data(binary_dir: Path, pytest_opts: str):
     """Executes the specified test using the provided firecracker binaries"""
     binary_dir = binary_dir.resolve()
 
     print(f"Collecting samples with {binary_dir}")
     subprocess.run(
-        ["./tools/test.sh", f"--binary-dir={binary_dir}", *tests, "-m", ""],
+        f"./tools/test.sh --binary-dir={binary_dir} {pytest_opts} -m ''",
         env=os.environ
         | {
             "AWS_EMF_ENVIRONMENT": "local",
             "AWS_EMF_NAMESPACE": "local",
         },
         check=True,
+        shell=True,
     )
     return load_data_series(
         Path("test_results/test-report.json"), binary_dir, reemit=True
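A note on the switch from an argv list to a single command string with shell=True (a sketch, not from the commit): pytest_opts arrives as one string that may contain several whitespace-separated pytest arguments, and letting the shell word-split it is what turns them back into separate argv entries. shlex.split mimics that splitting:

    import shlex

    # Hypothetical option string; quoting inside it is respected by the shell.
    pytest_opts = "-k test_boottime integration_tests/performance/test_boottime.py"
    print(shlex.split(f"./tools/test.sh --binary-dir=/tmp/build {pytest_opts} -m ''"))
    # ['./tools/test.sh', '--binary-dir=/tmp/build', '-k', 'test_boottime',
    #  'integration_tests/performance/test_boottime.py', '-m', '']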
@@ -330,15 +331,15 @@ def analyze_data(
 def ab_performance_test(
     a_revision: Path,
     b_revision: Path,
-    tests,
+    pytest_opts,
     p_thresh,
     strength_abs_thresh,
     noise_threshold,
 ):
     """Does an A/B-test of the specified test with the given firecracker/jailer binaries"""
 
     return binary_ab_test(
-        lambda bin_dir, _: collect_data(bin_dir, tests),
+        lambda bin_dir, _: collect_data(bin_dir, pytest_opts),
         lambda ah, be: analyze_data(
             ah,
             be,
@@ -371,7 +372,11 @@ def ab_performance_test(
         help="Directory containing firecracker and jailer binaries whose performance we want to compare against the results from a_revision",
         type=Path,
     )
-    run_parser.add_argument("--test", help="The test to run", nargs="+", required=True)
+    run_parser.add_argument(
+        "--pytest-opts",
+        help="Parameters to pass through to pytest, for example for test selection",
+        required=True,
+    )
     analyze_parser = subparsers.add_parser(
         "analyze",
         help="Analyze the results of two manually ran tests based on their test-report.json files",
@@ -410,7 +415,7 @@ def ab_performance_test(
     ab_performance_test(
         args.a_revision,
         args.b_revision,
-        args.test,
+        args.pytest_opts,
         args.significance,
         args.absolute_strength,
         args.noise_threshold,
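Finally, a minimal argparse sketch (hypothetical values, not part of the commit) of the interface change: the old --test flag used nargs="+" and produced a list, whereas --pytest-opts is one opaque string, so callers quote the whole option set:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--pytest-opts", required=True)

    # The caller quotes the entire value, so argparse receives a single string:
    args = parser.parse_args(
        ["--pytest-opts", "-k test_boottime integration_tests/performance/test_boottime.py"]
    )
    print(args.pytest_opts)
    # -k test_boottime integration_tests/performance/test_boottime.py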
