From b187d5c4080d72de696c93902c2a5006c5e1e03b Mon Sep 17 00:00:00 2001
From: Hans Halverson <hans_halverson@alumni.brown.edu>
Date: Thu, 18 Jun 2026 15:58:46 -0700
Subject: [PATCH] [tests] Third-party benchmark test suite that runs Octane,
 JetStream, and Web Tooling Benchmark

---
 .github/workflows/ci.yml             |   7 +-
 Cargo.lock                           |  10 ++
 Cargo.toml                           |   1 +
 src/benches/README.md                |   2 +-
 tests/perf/.gitignore                |   1 +
 tests/perf/Cargo.toml                |  18 ++
 tests/perf/README.md                 |  58 +++++++
 tests/perf/install.sh                |  67 +++++++
 tests/perf/shims/octane.js           |  12 ++
 tests/perf/src/main.rs               | 249 +++++++++++++++++++++++++++
 tests/perf/src/report.rs             | 144 ++++++++++++++++
 tests/perf/src/runner.rs             | 148 ++++++++++++++++
 tests/perf/src/suite.rs              | 126 ++++++++++++++
 tests/perf/src/suites/jetstream.rs   | 217 +++++++++++++++++++++++
 tests/perf/src/suites/mod.rs         |   3 +
 tests/perf/src/suites/octane.rs      | 145 ++++++++++++++++
 tests/perf/src/suites/web_tooling.rs | 145 ++++++++++++++++
 17 files changed, 1350 insertions(+), 3 deletions(-)
 create mode 100644 tests/perf/.gitignore
 create mode 100644 tests/perf/Cargo.toml
 create mode 100644 tests/perf/README.md
 create mode 100755 tests/perf/install.sh
 create mode 100644 tests/perf/shims/octane.js
 create mode 100644 tests/perf/src/main.rs
 create mode 100644 tests/perf/src/report.rs
 create mode 100644 tests/perf/src/runner.rs
 create mode 100644 tests/perf/src/suite.rs
 create mode 100644 tests/perf/src/suites/jetstream.rs
 create mode 100644 tests/perf/src/suites/mod.rs
 create mode 100644 tests/perf/src/suites/octane.rs
 create mode 100644 tests/perf/src/suites/web_tooling.rs

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index eea90326..77cf29bb 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -118,7 +118,7 @@ jobs:
       - name: Run integration tests on nightly without alloc_error feature enabled
         run: cargo brimstone-test --release --no-default-features --features nightly -- --reindex --ignore-unimplemented
     
-  build-benchmarks:
+  build-perf:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout code
@@ -126,7 +126,10 @@ jobs:
 
       - uses: Swatinem/rust-cache@82a92a6e8fbeee089604da2575dc567ae9ddeaab # v2.7.5
 
-      - name: Build benchmarks
+      - name: Build performance test suite
+        run: cargo build -p brimstone_perf
+
+      - name: Build benchmarks tests
         run: cargo bench --no-run
 
   build-fuzzer:
diff --git a/Cargo.lock b/Cargo.lock
index ab14f883..08b32688 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -132,6 +132,16 @@ dependencies = [
  "syn 2.0.100",
 ]
 
+[[package]]
+name = "brimstone_perf"
+version = "0.1.0"
+dependencies = [
+ "clap",
+ "regex",
+ "serde",
+ "serde_json",
+]
+
 [[package]]
 name = "brimstone_serialized_heap"
 version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
index e1695d3b..964466ae 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,6 +8,7 @@ members = [
   "tests",
   "tests/fuzz",
   "tests/harness",
+  "tests/perf",
 ]
 resolver = "2"
 
diff --git a/src/benches/README.md b/src/benches/README.md
index 5e4e78c4..e85929e3 100644
--- a/src/benches/README.md
+++ b/src/benches/README.md
@@ -1,6 +1,6 @@
 # Benchmarks
 
-Brimstone's performance testing is found in this directory.
+Brimstone's first-party performance microbenchmarks are found in this directory.
 
 ## Installation
 
diff --git a/tests/perf/.gitignore b/tests/perf/.gitignore
new file mode 100644
index 00000000..22d0d82f
--- /dev/null
+++ b/tests/perf/.gitignore
@@ -0,0 +1 @@
+vendor
diff --git a/tests/perf/Cargo.toml b/tests/perf/Cargo.toml
new file mode 100644
index 00000000..9bc35e71
--- /dev/null
+++ b/tests/perf/Cargo.toml
@@ -0,0 +1,18 @@
+[package]
+name = "brimstone_perf"
+version.workspace = true
+authors.workspace = true
+edition.workspace = true
+
+[[bin]]
+name = "bs-perf"
+path = "src/main.rs"
+
+[dependencies]
+clap = { workspace = true, features = ["derive"] }
+regex.workspace = true
+serde = { workspace = true, features = ["derive"] }
+serde_json.workspace = true
+
+[lints]
+workspace = true
diff --git a/tests/perf/README.md b/tests/perf/README.md
new file mode 100644
index 00000000..a6311c2e
--- /dev/null
+++ b/tests/perf/README.md
@@ -0,0 +1,58 @@
+# Performance suite
+
+Runs standard third-party JavaScript performance suites against brimstone.
+
+Supports [Octane](https://github.com/chromium/octane), [JetStream](https://github.com/WebKit/JetStream), and [Web Tooling Benchmark](https://github.com/v8/web-tooling-benchmark/).
+
+## Installation
+
+Suites are vendored into `vendor/` with `./install.sh`. Requires `git`, and `npm` for the Web Tooling Benchmark build.
+
+## Run
+
+```
+# Build bs in release mode and run Octane:
+cargo run -p brimstone_perf -- --suite octane
+
+# Run only specific benchmarks (case-insensitive substring match, repeatable):
+cargo run -p brimstone_perf -- --suite octane --bench richards --bench splay
+
+# Structured JSON, written to a file:
+cargo run -p brimstone_perf -- --suite octane --format json --out octane.json
+```
+
+Useful flags: `--bs-path <path>` (use an existing binary instead of building),
+`--vendor-dir <dir>`, `--format pretty|json`, `--out <file>`, `--flamegraph [<file>]`.
+
+## Profiling a run (flamegraph)
+
+Pass `--flamegraph` to profile the `bs` process for a run and write a flamegraph SVG,
+using the [`flamegraph`](https://github.com/flamegraph-rs/flamegraph) CLI.
+
+```
+# Initial setup: make sure flamegraph is installed
+cargo install flamegraph
+
+# Run a single benchmark with profiling and write flamegraph output
+cargo run --release -p brimstone_perf -- --suite octane --bench raytrace --flamegraph
+```
+
+Notes:
+
+- **Flamegraph arguments.** `--flamegraph-arg=<arg>` forwards any raw `flamegraph` argument.
+- **Output file.** `--flamegraph=<file>` writes the output to a particular file. Defaults to
+  `flamegraph.svg` if no file name is provided.
+- **Build with symbols.** When `--flamegraph` is set and the harness builds `bs` itself, it
+  builds release *with debug info* (`CARGO_PROFILE_RELEASE_DEBUG=true`) so frames are named.
+  If you supply your own `--bs-path`, build it with debug symbols yourself.
+
+## How it works
+
+The harness runs `bs` as a subprocess and prints results to stdout. `bs` is run with:
+
+```
+bs --expose-test-shell-compat <suite files...> <driver> [-- <forwarded cli args>]
+```
+
+`--expose-test-shell-compat` installs the shell host functions that benchmark suites
+expect. Everything after `--` is exposed as `globalThis.arguments`.
diff --git a/tests/perf/install.sh b/tests/perf/install.sh
new file mode 100755
index 00000000..74bb3fe8
--- /dev/null
+++ b/tests/perf/install.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+
+# Installs the 3p performance suites into ./vendor.
+
+set -e
+
+CURRENT_DIR=$(cd "$(dirname "$0")" && pwd)
+VENDOR_DIR="$CURRENT_DIR/vendor"
+
+# Pinned commits for each 3p suite
+OCTANE_COMMIT=570ad1ccfe86e3eecba0636c8f932ac08edec517
+JETSTREAM_COMMIT=b7babdf323e64e69bd2f6c376189c15825f5c73a
+WEB_TOOLING_COMMIT=4a12828c6a1eed02a70c011bd080445dd319a05f
+
+OCTANE_DIR="$VENDOR_DIR/octane"
+JETSTREAM_DIR="$VENDOR_DIR/jetstream"
+WEB_TOOLING_BENCHMARK_DIR="$VENDOR_DIR/web-tooling-benchmark"
+
+mkdir -p "$VENDOR_DIR"
+
+# Shallow-fetch a pinned commit, removing <dir> on failure: clone_pinned <dir> <url> <commit>.
+clone_pinned() {
+  rm -rf "$1" &&
+  git init -q "$1" &&
+  git -C "$1" remote add origin "$2" &&
+  git -C "$1" fetch -q --depth 1 origin "$3" &&
+  git -C "$1" checkout -q FETCH_HEAD || { rm -rf "$1"; return 1; }
+}
+
+if [ ! -d "$OCTANE_DIR/.git" ]; then
+  echo "==> Installing Octane"
+  clone_pinned "$OCTANE_DIR" https://github.com/chromium/octane "$OCTANE_COMMIT"
+else
+  echo "==> Octane already installed"
+fi
+
+if [ ! -d "$JETSTREAM_DIR/.git" ]; then
+  echo "==> Installing JetStream"
+  clone_pinned "$JETSTREAM_DIR" https://github.com/WebKit/JetStream "$JETSTREAM_COMMIT"
+else
+  echo "==> JetStream already installed"
+fi
+
+WEB_TOOLING_BENCHMARKS="acorn babel babel-minify babylon buble chai coffeescript espree \
+  esprima jshint lebab postcss prepack prettier source-map terser typescript uglify-js"
+# Bundles are built in list order, so only the last one proves that the build completed.
+if [ ! -f "$WEB_TOOLING_BENCHMARK_DIR/dist/cli-${WEB_TOOLING_BENCHMARKS##* }.js" ]; then
+  echo "==> Installing Web Tooling Benchmark"
+  if [ ! -d "$WEB_TOOLING_BENCHMARK_DIR/.git" ]; then
+    clone_pinned "$WEB_TOOLING_BENCHMARK_DIR" https://github.com/v8/web-tooling-benchmark "$WEB_TOOLING_COMMIT"
+  fi
+  (
+    cd "$WEB_TOOLING_BENCHMARK_DIR"
+    [ -d node_modules ] || npm ci
+
+    # Build separate bundles for each individual benchmark so they can be run independently.
+    for bench in $WEB_TOOLING_BENCHMARKS; do
+      echo "    building standalone bundle for $bench"
+      npm run build -- --env.only "$bench"
+      mv dist/cli.js "dist/cli-$bench.js"
+    done
+  )
+else
+  echo "==> Web Tooling Benchmark already installed"
+fi
+
+echo "Done. Vendored suites are in $VENDOR_DIR"
diff --git a/tests/perf/shims/octane.js b/tests/perf/shims/octane.js
new file mode 100644
index 00000000..1b118b90
--- /dev/null
+++ b/tests/perf/shims/octane.js
@@ -0,0 +1,12 @@
+// Print easily parseable results from a particular run of Octane
+BenchmarkSuite.RunSuites({
+  NotifyResult: function (name, result) {
+    print("RESULT " + name + " " + result);
+  },
+  NotifyError: function (name, error) {
+    print("ERROR " + name + " " + error);
+  },
+  NotifyScore: function (score) {
+    print("SCORE " + score);
+  },
+});
diff --git a/tests/perf/src/main.rs b/tests/perf/src/main.rs
new file mode 100644
index 00000000..3afb29a9
--- /dev/null
+++ b/tests/perf/src/main.rs
@@ -0,0 +1,249 @@
+mod report;
+mod runner;
+mod suite;
+mod suites;
+
+use std::{
+    path::{Path, PathBuf},
+    process::Command,
+};
+
+use clap::{Parser, ValueEnum};
+
+use crate::report::SuiteRun;
+use crate::runner::Flamegraph;
+use crate::suite::{BenchFilter, RunContext, all_suites, find_suite};
+
+#[derive(Clone, Copy, ValueEnum)]
+enum Format {
+    Pretty,
+    Json,
+}
+
+#[derive(Parser)]
+#[command(about = "Run standard JS performance suites against the brimstone `bs` engine")]
+struct Args {
+    /// Suite(s) to run: octane, web-tooling, jetstream, or all. Repeatable.
+    #[arg(long, default_values_t = vec!["octane".to_string()])]
+    suite: Vec<String>,
+
+    /// Run only benchmarks whose name contains this string (case-insensitive). Repeatable.
+    #[arg(long)]
+    bench: Vec<String>,
+
+    /// Output format.
+    #[arg(long, value_enum, default_value_t = Format::Pretty)]
+    format: Format,
+
+    /// Write output to this file instead of stdout.
+    #[arg(long)]
+    out: Option<PathBuf>,
+
+    /// Path to the brimstone executable. If omitted, builds and uses a release build.
+    #[arg(long)]
+    bs_path: Option<PathBuf>,
+
+    /// Directory where suites are installed. Defaults to the crate's vendor/ directory.
+    #[arg(long)]
+    vendor_dir: Option<PathBuf>,
+
+    /// Profile each `bs` run with the `flamegraph` CLI (must be on PATH), writing an SVG to
+    /// this path (default: flamegraph.svg)
+    #[arg(long, num_args = 0..=1, default_missing_value = "flamegraph.svg")]
+    flamegraph: Option<PathBuf>,
+
+    /// Extra raw argument forwarded to the `flamegraph` CLI, before the `--` separator. Repeatable.
+    #[arg(long)]
+    flamegraph_arg: Vec<String>,
+}
+
+fn main() {
+    let args = Args::parse();
+
+    let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
+    let workspace_root = manifest_dir
+        .parent()
+        .and_then(Path::parent)
+        .unwrap_or(manifest_dir)
+        .to_path_buf();
+
+    let shims_dir = manifest_dir.join("shims");
+    let vendor_dir = args
+        .vendor_dir
+        .clone()
+        .unwrap_or_else(|| manifest_dir.join("vendor"));
+
+    let requested = resolve_suites(&args.suite);
+    let suite_specs = match requested {
+        Ok(s) => s,
+        Err(e) => {
+            eprintln!("error: {e}");
+            std::process::exit(2);
+        }
+    };
+
+    let filter = BenchFilter::new(&args.bench);
+
+    // Validate the flamegraph CLI up front rather than failing on the first `bs` spawn.
+    let flamegraph_config = match build_flamegraph_config(&args) {
+        Ok(cfg) => cfg,
+        Err(e) => {
+            eprintln!("error: {e}");
+            std::process::exit(1);
+        }
+    };
+
+    // Locate or build bs (with debug symbols when profiling, for named frames).
+    let bs_path = match resolve_bs_path(&args, &workspace_root, flamegraph_config.is_some()) {
+        Ok(p) => p,
+        Err(e) => {
+            eprintln!("error: {e}");
+            std::process::exit(1);
+        }
+    };
+
+    let ctx = RunContext {
+        bs_path: &bs_path,
+        vendor_dir: &vendor_dir,
+        shims_dir: &shims_dir,
+        filter: &filter,
+        flamegraph: flamegraph_config.as_ref(),
+    };
+
+    let mut runs: Vec<SuiteRun> = Vec::new();
+    for spec in &suite_specs {
+        if !spec.is_available(&ctx) {
+            eprintln!(
+                "skipping {}: not installed under {} (run tests/perf/install.sh)",
+                spec.name(),
+                vendor_dir.join(spec.vendor_subdir()).display()
+            );
+            continue;
+        }
+        eprintln!("running {}...", spec.name());
+        runs.push(spec.run(&ctx));
+    }
+
+    if runs.is_empty() {
+        eprintln!("no suites ran; nothing to report");
+        std::process::exit(1);
+    }
+
+    let rendered = match args.format {
+        Format::Pretty => report::to_pretty(&runs),
+        Format::Json => report::to_json(&runs),
+    };
+
+    match &args.out {
+        Some(path) => {
+            if let Err(e) = std::fs::write(path, rendered) {
+                eprintln!("error: failed to write {}: {e}", path.display());
+                std::process::exit(1);
+            }
+            eprintln!("wrote results to {}", path.display());
+        }
+        None => println!("{rendered}"),
+    }
+}
+
+/// Resolve suite names (including "all") to specs, in order and de-duplicated.
+fn resolve_suites(names: &[String]) -> Result<Vec<Box<dyn suite::SuiteSpec>>, String> {
+    let mut selected: Vec<Box<dyn suite::SuiteSpec>> = Vec::new();
+    let mut seen: Vec<&str> = Vec::new();
+
+    let push = |spec: Box<dyn suite::SuiteSpec>,
+                seen: &mut Vec<&'static str>,
+                out: &mut Vec<Box<dyn suite::SuiteSpec>>| {
+        if !seen.contains(&spec.name()) {
+            seen.push(spec.name());
+            out.push(spec);
+        }
+    };
+
+    for name in names {
+        if name == "all" {
+            for spec in all_suites() {
+                push(spec, &mut seen, &mut selected);
+            }
+        } else {
+            match find_suite(name) {
+                Some(spec) => push(spec, &mut seen, &mut selected),
+                None => {
+                    let known: Vec<&str> = all_suites().iter().map(|s| s.name()).collect();
+                    return Err(format!(
+                        "unknown suite '{name}'; known suites: {}, all",
+                        known.join(", ")
+                    ));
+                }
+            }
+        }
+    }
+
+    Ok(selected)
+}
+
+/// Path to the `bs` binary, building it in release if needed. `with_debug_symbols` adds
+/// debug info (for readable flamegraph frames) while keeping release optimizations.
+fn resolve_bs_path(
+    args: &Args,
+    workspace_root: &Path,
+    with_debug_symbols: bool,
+) -> Result<PathBuf, String> {
+    if let Some(path) = &args.bs_path {
+        if path.is_file() {
+            if with_debug_symbols {
+                eprintln!(
+                    "note: profiling a pre-built --bs-path; for readable frames it should be \
+                     built with debug symbols (e.g. CARGO_PROFILE_RELEASE_DEBUG=true)"
+                );
+            }
+            return Ok(path.clone());
+        }
+        return Err(format!("--bs-path {} does not exist", path.display()));
+    }
+
+    let default = workspace_root.join("target/release/bs");
+
+    let mut cmd = Command::new("cargo");
+    cmd.current_dir(workspace_root)
+        .args(["build", "--release", "-p", "brimstone"]);
+    if with_debug_symbols {
+        // Keep release optimizations but emit debug info, so flamegraph frames are named.
+        eprintln!("building bs (release + debug symbols)...");
+        cmd.env("CARGO_PROFILE_RELEASE_DEBUG", "true");
+    } else {
+        eprintln!("building bs (release)...");
+    }
+    let status = cmd
+        .status()
+        .map_err(|e| format!("failed to run cargo build: {e}"))?;
+    if !status.success() {
+        return Err("cargo build --release -p brimstone failed".to_string());
+    }
+
+    if default.is_file() {
+        Ok(default)
+    } else {
+        Err(format!(
+            "bs binary not found at {}; build it or pass --bs-path",
+            default.display()
+        ))
+    }
+}
+
+fn build_flamegraph_config(args: &Args) -> Result<Option<Flamegraph>, String> {
+    let Some(output) = args.flamegraph.clone() else {
+        if !args.flamegraph_arg.is_empty() {
+            eprintln!("note: --flamegraph-arg ignored without --flamegraph");
+        }
+        return Ok(None);
+    };
+
+    if !runner::flamegraph_available() {
+        return Err("--flamegraph requires the `flamegraph` CLI on PATH; install it with \
+             `cargo install flamegraph` (it uses perf on Linux, dtrace on macOS)"
+            .to_string());
+    }
+
+    Ok(Some(Flamegraph { output, extra_args: args.flamegraph_arg.clone() }))
+}
diff --git a/tests/perf/src/report.rs b/tests/perf/src/report.rs
new file mode 100644
index 00000000..8cd9fe79
--- /dev/null
+++ b/tests/perf/src/report.rs
@@ -0,0 +1,144 @@
+use serde::Serialize;
+
+#[derive(Clone, Copy, Serialize)]
+#[serde(rename_all = "lowercase")]
+pub enum Status {
+    Ok,
+    /// Intentionally not run (e.g. needs an engine feature brimstone lacks).
+    Skipped,
+    /// Attempted but failed (crash, exception, or missing score in output).
+    Error,
+}
+
+#[derive(Clone, Serialize)]
+pub struct BenchResult {
+    pub name: String,
+    /// Score as reported by the suite. `None` for skipped/errored.
+    pub score: Option<f64>,
+    /// Suite-defined unit, e.g. "octane", "runs/s".
+    pub unit: String,
+    pub status: Status,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub note: Option<String>,
+}
+
+impl BenchResult {
+    pub fn ok(name: impl Into<String>, score: f64, unit: impl Into<String>) -> Self {
+        BenchResult {
+            name: name.into(),
+            score: Some(score),
+            unit: unit.into(),
+            status: Status::Ok,
+            note: None,
+        }
+    }
+
+    pub fn skipped(
+        name: impl Into<String>,
+        unit: impl Into<String>,
+        note: impl Into<String>,
+    ) -> Self {
+        BenchResult {
+            name: name.into(),
+            score: None,
+            unit: unit.into(),
+            status: Status::Skipped,
+            note: Some(note.into()),
+        }
+    }
+
+    pub fn error(
+        name: impl Into<String>,
+        unit: impl Into<String>,
+        note: impl Into<String>,
+    ) -> Self {
+        BenchResult {
+            name: name.into(),
+            score: None,
+            unit: unit.into(),
+            status: Status::Error,
+            note: Some(note.into()),
+        }
+    }
+}
+
+#[derive(Clone, Serialize)]
+pub struct SuiteRun {
+    pub suite: String,
+    pub bs_path: String,
+    pub wall_clock_ms: f64,
+    pub results: Vec<BenchResult>,
+    /// Overall score for the suite, as reported by the suite itself.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub summary: Option<BenchResult>,
+    #[serde(skip)]
+    pub pretty_score_precision: usize,
+}
+
+pub fn to_json(runs: &[SuiteRun]) -> String {
+    serde_json::to_string_pretty(runs).expect("suite runs are serializable")
+}
+
+/// Render the suite runs as a human-readable aligned table.
+pub fn to_pretty(runs: &[SuiteRun]) -> String {
+    let mut out = String::new();
+    for run in runs {
+        out.push_str(&format!("=== {} ===\n", run.suite));
+        out.push_str(&format!("wall clock: {:.0} ms\n\n", run.wall_clock_ms));
+
+        let name_width = run
+            .results
+            .iter()
+            .map(|r| r.name.len())
+            .chain(std::iter::once("Benchmark".len()))
+            .max()
+            .unwrap_or(9)
+            .max(9);
+
+        out.push_str(&format!(
+            "{:<width$}  {:>14}  {:<8}  {}\n",
+            "Benchmark",
+            "Score",
+            "Status",
+            "Note",
+            width = name_width
+        ));
+        out.push_str(&format!("{}\n", "-".repeat(name_width + 14 + 8 + 10)));
+
+        for r in &run.results {
+            let status = match r.status {
+                Status::Ok => "ok",
+                Status::Skipped => "skipped",
+                Status::Error => "error",
+            };
+            out.push_str(&format!(
+                "{:<width$}  {:>14}  {:<8}  {}\n",
+                r.name,
+                format_score(r.score, run.pretty_score_precision),
+                status,
+                r.note.as_deref().unwrap_or(""),
+                width = name_width
+            ));
+        }
+
+        if let Some(summary) = &run.summary {
+            out.push_str(&format!("{}\n", "-".repeat(name_width + 14 + 8 + 10)));
+            out.push_str(&format!(
+                "{:<width$}  {:>14}  ({})\n",
+                summary.name,
+                format_score(summary.score, run.pretty_score_precision),
+                summary.unit,
+                width = name_width
+            ));
+        }
+        out.push('\n');
+    }
+    out
+}
+
+fn format_score(score: Option<f64>, precision: usize) -> String {
+    match score {
+        None => "-".to_string(),
+        Some(score) => format!("{score:.precision$}"),
+    }
+}
diff --git a/tests/perf/src/runner.rs b/tests/perf/src/runner.rs
new file mode 100644
index 00000000..d13ecd90
--- /dev/null
+++ b/tests/perf/src/runner.rs
@@ -0,0 +1,148 @@
+use std::{
+    path::{Path, PathBuf},
+    process::Command,
+    sync::atomic::{AtomicU64, Ordering},
+    time::{Duration, Instant},
+};
+
+pub struct BsOutput {
+    pub stdout: String,
+    pub stderr: String,
+    pub success: bool,
+    pub wall_clock_ms: f64,
+    pub timed_out: bool,
+}
+
+/// Profiles `bs` via the [`flamegraph`](https://github.com/flamegraph-rs/flamegraph) CLI,
+/// writing an SVG to `output`.
+#[derive(Clone)]
+pub struct Flamegraph {
+    pub output: PathBuf,
+    pub extra_args: Vec<String>,
+}
+
+impl Flamegraph {
+    pub fn labeled(&self, label: &str) -> Flamegraph {
+        let sanitized: String = label
+            .chars()
+            .map(|c| {
+                if c.is_ascii_alphanumeric() || c == '-' || c == '_' {
+                    c
+                } else {
+                    '-'
+                }
+            })
+            .collect();
+        let stem = self
+            .output
+            .file_stem()
+            .and_then(|s| s.to_str())
+            .unwrap_or("flamegraph");
+        let ext = self
+            .output
+            .extension()
+            .and_then(|s| s.to_str())
+            .unwrap_or("svg");
+        Flamegraph {
+            output: self
+                .output
+                .with_file_name(format!("{stem}.{sanitized}.{ext}")),
+            extra_args: self.extra_args.clone(),
+        }
+    }
+}
+
+pub fn flamegraph_available() -> bool {
+    Command::new("flamegraph")
+        .arg("--version")
+        .stdout(std::process::Stdio::null())
+        .stderr(std::process::Stdio::null())
+        .status()
+        .map(|s| s.success())
+        .unwrap_or(false)
+}
+
+pub fn run_bs(
+    bs_path: &Path,
+    flags: &[String],
+    files: &[PathBuf],
+    flamegraph: Option<&Flamegraph>,
+) -> std::io::Result<BsOutput> {
+    run_bs_with_timeout(bs_path, flags, files, &[], None, flamegraph)
+}
+
+/// Runs `bs` (optionally wrapped in `flamegraph`) with stdout/stderr captured via temp files,
+/// polling every 50ms and killing the child once `timeout` (if any) is exceeded.
+///
+/// The temp files are removed on every path, including spawn/wait failures.
+pub fn run_bs_with_timeout(
+    bs_path: &Path,
+    flags: &[String],
+    files: &[PathBuf],
+    script_args: &[String],
+    timeout: Option<Duration>,
+    flamegraph: Option<&Flamegraph>,
+) -> std::io::Result<BsOutput> {
+    static COUNTER: AtomicU64 = AtomicU64::new(0);
+    let id = COUNTER.fetch_add(1, Ordering::Relaxed);
+    let base = std::env::temp_dir().join(format!("bs-perf-{}-{}", std::process::id(), id));
+    let out_path = base.with_extension("out");
+    let err_path = base.with_extension("err");
+
+    // `flamegraph -- bs <args>`: flamegraph forwards everything after `--` to bs verbatim,
+    // including the second `--` that hands `script_args` to bs.
+    let mut cmd = match flamegraph {
+        Some(fg) => {
+            eprintln!("    profiling -> {}", fg.output.display());
+            let mut cmd = Command::new("flamegraph");
+            cmd.arg("-o").arg(&fg.output);
+            cmd.args(&fg.extra_args);
+            cmd.arg("--");
+            cmd.arg(bs_path);
+            cmd
+        }
+        None => Command::new(bs_path),
+    };
+    cmd.args(flags).args(files);
+    if !script_args.is_empty() {
+        cmd.arg("--").args(script_args);
+    }
+
+    // Everything that can fail runs inside this closure so the cleanup below always happens.
+    let start = Instant::now();
+    let mut run = || -> std::io::Result<(std::process::ExitStatus, bool)> {
+        cmd.stdout(std::fs::File::create(&out_path)?)
+            .stderr(std::fs::File::create(&err_path)?);
+        let mut child = cmd.spawn()?;
+        loop {
+            if let Some(status) = child.try_wait()? {
+                return Ok((status, false));
+            }
+            if timeout.is_some_and(|limit| start.elapsed() > limit) {
+                let _ = child.kill();
+                return Ok((child.wait()?, true));
+            }
+            std::thread::sleep(Duration::from_millis(50));
+        }
+    };
+    let outcome = run();
+
+    let wall_clock_ms = start.elapsed().as_secs_f64() * 1000.0;
+    let stdout = std::fs::read_to_string(&out_path).unwrap_or_default();
+    let mut stderr = std::fs::read_to_string(&err_path).unwrap_or_default();
+    let _ = std::fs::remove_file(&out_path);
+    let _ = std::fs::remove_file(&err_path);
+    let (status, timed_out) = outcome?;
+
+    if timed_out {
+        stderr.push_str(&format!("\nkilled: exceeded {:?} timeout", timeout.unwrap()));
+    }
+
+    Ok(BsOutput {
+        stdout,
+        stderr,
+        success: status.success() && !timed_out,
+        wall_clock_ms,
+        timed_out,
+    })
+}
diff --git a/tests/perf/src/suite.rs b/tests/perf/src/suite.rs
new file mode 100644
index 00000000..dc0c5fbc
--- /dev/null
+++ b/tests/perf/src/suite.rs
@@ -0,0 +1,126 @@
+use std::path::{Path, PathBuf};
+
+use crate::report::{BenchResult, SuiteRun};
+use crate::runner::Flamegraph;
+use crate::suites;
+
+/// Case-insensitive substring filter over benchmark names; an empty filter matches all.
+#[derive(Default)]
+pub struct BenchFilter {
+    terms: Vec<String>,
+}
+
+impl BenchFilter {
+    pub fn new(terms: &[String]) -> Self {
+        BenchFilter { terms: terms.iter().map(|t| t.to_lowercase()).collect() }
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.terms.is_empty()
+    }
+
+    pub fn matches(&self, name: &str) -> bool {
+        if self.is_empty() {
+            return true;
+        }
+        let name = name.to_lowercase();
+        self.terms.iter().any(|t| name.contains(t))
+    }
+}
+
+/// Inputs shared by every suite run.
+pub struct RunContext<'a> {
+    pub bs_path: &'a Path,
+    pub vendor_dir: &'a Path,
+    pub shims_dir: &'a Path,
+    /// Which benchmarks to run; empty = all.
+    pub filter: &'a BenchFilter,
+    pub flamegraph: Option<&'a Flamegraph>,
+}
+
+impl RunContext<'_> {
+    pub fn suite_dir(&self, subdir: &str) -> PathBuf {
+        self.vendor_dir.join(subdir)
+    }
+
+    pub fn shim(&self, file: &str) -> PathBuf {
+        self.shims_dir.join(file)
+    }
+
+    /// The flamegraph spec for a profiled `bs` invocation, or `None` if profiling is off.
+    /// Pass a `label` (e.g. a benchmark name) when a suite spawns `bs` more than once.
+    pub fn flamegraph(&self, label: Option<&str>) -> Option<Flamegraph> {
+        self.flamegraph.map(|base| match label {
+            Some(label) => base.labeled(label),
+            None => base.clone(),
+        })
+    }
+
+    pub fn suite_run(
+        &self,
+        suite: &impl SuiteSpec,
+        wall_clock_ms: f64,
+        results: Vec<BenchResult>,
+        summary: Option<BenchResult>,
+    ) -> SuiteRun {
+        SuiteRun {
+            suite: suite.name().to_string(),
+            bs_path: self.bs_path.display().to_string(),
+            wall_clock_ms,
+            results,
+            summary,
+            pretty_score_precision: suite.pretty_score_precision(),
+        }
+    }
+
+    pub fn single_error_run(
+        &self,
+        suite: &impl SuiteSpec,
+        unit: &str,
+        message: impl Into<String>,
+    ) -> SuiteRun {
+        self.suite_run(suite, 0.0, vec![BenchResult::error("<run>", unit, message)], None)
+    }
+}
+
+pub trait SuiteSpec {
+    fn name(&self) -> &'static str;
+
+    fn vendor_subdir(&self) -> &'static str;
+
+    fn is_available(&self, ctx: &RunContext) -> bool;
+
+    fn pretty_score_precision(&self) -> usize;
+
+    fn run(&self, ctx: &RunContext) -> SuiteRun;
+}
+
+pub fn all_suites() -> Vec<Box<dyn SuiteSpec>> {
+    vec![
+        Box::new(suites::octane::Octane),
+        Box::new(suites::web_tooling::WebTooling),
+        Box::new(suites::jetstream::JetStream),
+    ]
+}
+
+pub fn find_suite(name: &str) -> Option<Box<dyn SuiteSpec>> {
+    all_suites().into_iter().find(|s| s.name() == name)
+}
+
+/// The first non-blank line of `s` (trimmed), if any.
+pub fn first_nonempty_line(s: &str) -> Option<&str> {
+    s.lines().map(str::trim).find(|l| !l.is_empty())
+}
+
+/// Geometric mean of the scored results (the suites' own scoring), or `None` if nothing
+/// scored. Reported as a "Total" carrying the unit of the first result.
+pub fn geomean_summary(results: &[BenchResult]) -> Option<BenchResult> {
+    let scores: Vec<f64> = results.iter().filter_map(|r| r.score).collect();
+    if scores.is_empty() {
+        return None;
+    }
+    let unit = results.first().map_or("", |r| r.unit.as_str());
+    let sum_ln: f64 = scores.iter().map(|s| s.ln()).sum();
+    let geomean = (sum_ln / scores.len() as f64).exp();
+    Some(BenchResult::ok("Total", geomean, unit))
+}
diff --git a/tests/perf/src/suites/jetstream.rs b/tests/perf/src/suites/jetstream.rs
new file mode 100644
index 00000000..ecf7fca2
--- /dev/null
+++ b/tests/perf/src/suites/jetstream.rs
@@ -0,0 +1,217 @@
+use std::{path::Path, time::Duration};
+
+use serde_json::Value as Json;
+
+use crate::report::{BenchResult, SuiteRun};
+use crate::runner::run_bs_with_timeout;
+use crate::suite::{RunContext, SuiteSpec, first_nonempty_line, geomean_summary};
+
+const UNIT: &str = "jetstream"; // unit label attached to every result of this suite
+
+/// JetStream's shell entrypoint, resolved inside the vendored suite directory.
+const CLI: &str = "cli.js";
+
+/// `iteration-count` must exceed `worst-case-count`; kept modest so a full run is tractable.
+const ITERATION_COUNT: &str = "--iteration-count=8";
+const WORST_CASE_COUNT: &str = "--worst-case-count=2";
+/// Timeout handed to `run_bs_with_timeout` for each individual benchmark process.
+const BENCH_TIMEOUT: Duration = Duration::from_secs(90);
+
+/// All individual benchmarks in this suite; each name is passed to `cli.js` as `--test=<name>`.
+const BENCHMARKS: &[&str] = &[
+    "8bitbench-wasm",
+    "acorn-wtb",
+    "ai-astar",
+    "Air",
+    "argon2-wasm",
+    "async-fs",
+    "babel-minify-wtb",
+    "babel-wtb",
+    "Babylon",
+    "babylon-wtb",
+    "babylonjs-scene-es6",
+    "babylonjs-startup-es6",
+    "Basic",
+    "bigint-noble-ed25519",
+    "Box2D",
+    "cdjs",
+    "chai-wtb",
+    "crypto",
+    "Dart-flute-todomvc-wasm",
+    "delta-blue",
+    "dotnet-aot-wasm",
+    "dotnet-interp-wasm",
+    "doxbee-async",
+    "doxbee-promise",
+    "earley-boyer",
+    "espree-wtb",
+    "esprima-next-wtb",
+    "first-inspector-code-load",
+    "FlightPlanner",
+    "gaussian-blur",
+    "gbemu",
+    "hash-map",
+    "j2cl-box2d-wasm",
+    "js-tokens",
+    "jsdom-d3-startup",
+    "json-parse-inspector",
+    "json-stringify-inspector",
+    "Kotlin-compose-wasm",
+    "lazy-collections",
+    "mandreel",
+    "ML",
+    "mobx-startup",
+    "multi-inspector-code-load",
+    "navier-stokes",
+    "octane-code-load",
+    "OfflineAssembler",
+    "pdfjs",
+    "postcss-wtb",
+    "prettier-wtb",
+    "prismjs-startup-es6",
+    "proxy-mobx",
+    "proxy-vue",
+    "raytrace",
+    "raytrace-private-class-fields",
+    "raytrace-public-class-fields",
+    "regexp-octane",
+    "richards",
+    "richards-wasm",
+    "source-map-wtb",
+    "splay",
+    "sqlite3-wasm",
+    "stanford-crypto-aes",
+    "stanford-crypto-pbkdf2",
+    "stanford-crypto-sha256",
+    "Sunspider",
+    "sync-fs",
+    "threejs",
+    "transformersjs-bert-wasm",
+    "tsf-wasm",
+    "typescript-lib",
+    "UniPoker",
+    "validatorjs",
+    "web-ssr",
+    "WSL",
+    "zlib-wasm",
+];
+
+/// JetStream suite: every selected benchmark runs in its own `bs` process via `cli.js`.
+pub struct JetStream;
+impl SuiteSpec for JetStream {
+    fn name(&self) -> &'static str {
+        "jetstream"
+    }
+    /// Vendored checkout directory name.
+    fn vendor_subdir(&self) -> &'static str {
+        "jetstream"
+    }
+    /// Available once the CLI entrypoint file exists in the vendored directory.
+    fn is_available(&self, ctx: &RunContext) -> bool {
+        ctx.suite_dir(self.vendor_subdir()).join(CLI).is_file()
+    }
+    /// Scores are recorded with a precision of 2.
+    fn pretty_score_precision(&self) -> usize {
+        2
+    }
+    /// Runs the benchmarks admitted by `ctx.filter` in order, summing per-benchmark wall clock.
+    fn run(&self, ctx: &RunContext) -> SuiteRun {
+        let cli = ctx.suite_dir(self.vendor_subdir()).join(CLI);
+        let wanted = |name: &&str| ctx.filter.matches(name);
+        let selected: Vec<&str> = BENCHMARKS.iter().copied().filter(wanted).collect();
+        let total = selected.len();
+
+        // Strictly sequential; progress is reported on stderr before each benchmark starts.
+        let mut wall_clock_ms = 0.0;
+        let results: Vec<BenchResult> = selected
+            .into_iter()
+            .enumerate()
+            .map(|(index, name)| {
+                eprintln!("  [{}/{}] {name}", index + 1, total);
+                let (result, ms) = run_one(ctx, &cli, name);
+                wall_clock_ms += ms;
+                result
+            })
+            .collect();
+        let summary = geomean_summary(&results);
+        ctx.suite_run(self, wall_clock_ms, results, summary)
+    }
+}
+
+/// Runs benchmark `name` through `cli` in its own `bs` process, bounded by `BENCH_TIMEOUT`.
+/// Returns the outcome together with the wall-clock milliseconds reported for the process
+/// (0.0 if `bs` could not be spawned).
+fn run_one(ctx: &RunContext, cli: &Path, name: &str) -> (BenchResult, f64) {
+    // Arguments for the JetStream CLI script, kept separate from the flags given to `bs`.
+    let script_args = [
+        format!("--test={name}"),
+        String::from("--no-prefetch"),
+        String::from("--dump-json-results"),
+        String::from("--force-gc"),
+        String::from(ITERATION_COUNT),
+        String::from(WORST_CASE_COUNT),
+    ];
+
+    let flamegraph = ctx.flamegraph(Some(name));
+    let output = match run_bs_with_timeout(
+        ctx.bs_path,
+        &shell_flags(),
+        &[cli.to_path_buf()],
+        &script_args,
+        Some(BENCH_TIMEOUT),
+        flamegraph.as_ref(),
+    ) {
+        Ok(output) => output,
+        Err(e) => return (BenchResult::error(name, UNIT, format!("failed to spawn bs: {e}")), 0.0),
+    };
+
+    // A parsed score counts even if the timeout also fired. Without one, the run is a timeout,
+    // a WebAssembly benchmark (reported as skipped), or a hard error, checked in that order.
+    let result = if let Some(score) = parse_score(&output.stdout, name) {
+        BenchResult::ok(name, score, UNIT)
+    } else if output.timed_out {
+        BenchResult::error(name, UNIT, format!("timed out after {BENCH_TIMEOUT:?}"))
+    } else {
+        let detail = failure_detail(&output.stdout, &output.stderr);
+        if name.ends_with("-wasm") || detail.contains("WebAssembly") {
+            BenchResult::skipped(name, UNIT, "requires WebAssembly")
+        } else {
+            BenchResult::error(name, UNIT, detail)
+        }
+    };
+    (result, output.wall_clock_ms)
+}
+
+/// Reads the benchmark's score from the JSON results line found in `stdout`.
+/// The score lives at `<JetStreamVersion>.tests.<name>.metrics.Score.current[0]`; `None` is
+/// returned when no such line exists, it is not valid JSON, or any step of that path is absent.
+fn parse_score(stdout: &str, name: &str) -> Option<f64> {
+    // The results line is the first one that starts with `{` and mentions "JetStream".
+    let is_results = |line: &&str| line.starts_with('{') && line.contains("JetStream");
+    let line = stdout.lines().map(str::trim).find(is_results)?;
+    let json: Json = serde_json::from_str(line).ok()?;
+
+    // The top-level key carries a version (e.g. "JetStream3.0"), so take the first value
+    // rather than looking it up by name.
+    let root = json.as_object()?.values().next()?;
+    // Walk the remaining object keys one level at a time.
+    let current = ["tests", name, "metrics", "Score", "current"]
+        .into_iter()
+        .try_fold(root, |node, key| node.get(key))?;
+    current.get(0)?.as_f64()
+}
+
+/// Picks the text describing a failed run: a stdout line containing "failed:" or "Error", else
+/// the first non-blank stderr line, else the first non-blank stdout line, else a fixed fallback.
+fn failure_detail(stdout: &str, stderr: &str) -> String {
+    let is_failure = |line: &&str| line.contains("failed:") || line.contains("Error");
+    let reported = stdout.lines().map(str::trim).find(is_failure);
+    let detail = reported
+        .or_else(|| first_nonempty_line(stderr))
+        .or_else(|| first_nonempty_line(stdout));
+    detail.unwrap_or("no score produced").to_string()
+}
+
+fn shell_flags() -> Vec<String> {
+    vec![String::from("--expose-test-shell-compat")] // flags for `bs`, not the script's args
+}
diff --git a/tests/perf/src/suites/mod.rs b/tests/perf/src/suites/mod.rs
new file mode 100644
index 00000000..6f880491
--- /dev/null
+++ b/tests/perf/src/suites/mod.rs
@@ -0,0 +1,3 @@
+pub mod jetstream;
+pub mod octane;
+pub mod web_tooling;
diff --git a/tests/perf/src/suites/octane.rs b/tests/perf/src/suites/octane.rs
new file mode 100644
index 00000000..3e09c670
--- /dev/null
+++ b/tests/perf/src/suites/octane.rs
@@ -0,0 +1,145 @@
+use std::path::PathBuf;
+
+use crate::report::{BenchResult, SuiteRun};
+use crate::runner::run_bs;
+use crate::suite::{RunContext, SuiteSpec, first_nonempty_line};
+
+const UNIT: &str = "octane"; // unit label attached to every result of this suite
+
+/// Octane 2.0 benchmark groups and their source files, in load order (mirrors Octane's
+/// `run.js`). A group with any file missing from the vendored checkout is reported as skipped.
+const BENCHMARKS: &[(&str, &[&str])] = &[
+    ("Richards", &["richards.js"]),
+    ("DeltaBlue", &["deltablue.js"]),
+    ("Crypto", &["crypto.js"]),
+    ("RayTrace", &["raytrace.js"]),
+    ("EarleyBoyer", &["earley-boyer.js"]),
+    ("RegExp", &["regexp.js"]),
+    ("Splay", &["splay.js"]),
+    ("NavierStokes", &["navier-stokes.js"]),
+    ("PdfJS", &["pdfjs.js"]),
+    ("Mandreel", &["mandreel.js"]),
+    ("Gameboy", &["gbemu-part1.js", "gbemu-part2.js"]),
+    ("CodeLoad", &["code-load.js"]),
+    ("Box2D", &["box2d.js"]),
+    ("zlib", &["zlib.js", "zlib-data.js"]),
+    (
+        "Typescript",
+        &[
+            "typescript.js",
+            "typescript-input.js",
+            "typescript-compiler.js",
+        ],
+    ),
+];
+
+/// Octane suite: all selected benchmark groups are loaded into a single `bs` process.
+pub struct Octane;
+impl SuiteSpec for Octane {
+    fn name(&self) -> &'static str {
+        "octane"
+    }
+    /// Vendored checkout directory name.
+    fn vendor_subdir(&self) -> &'static str {
+        "octane"
+    }
+    /// Available once `base.js` exists in the vendored directory.
+    fn is_available(&self, ctx: &RunContext) -> bool {
+        let base = ctx.suite_dir(self.vendor_subdir()).join("base.js");
+        base.is_file()
+    }
+    /// Scores are recorded with a precision of 0.
+    fn pretty_score_precision(&self) -> usize {
+        0
+    }
+    /// Loads `base.js`, the files of every selected group, and the driver shim into one `bs`
+    /// process, then parses the results it prints. Groups with missing files are reported as
+    /// skipped; if no group can be loaded, `bs` is not started at all.
+    fn run(&self, ctx: &RunContext) -> SuiteRun {
+        let suite_dir = ctx.suite_dir(self.vendor_subdir());
+
+        // Files are loaded in order: base.js, the selected benchmark files, then the driver.
+        let mut files: Vec<PathBuf> = vec![suite_dir.join("base.js")];
+        let mut skipped: Vec<BenchResult> = Vec::new();
+        let mut loaded_groups = 0;
+        for (group, group_files) in BENCHMARKS {
+            if ctx.filter.matches(group) {
+                // A group is loaded only when every one of its files exists.
+                let paths: Vec<PathBuf> = group_files.iter().map(|f| suite_dir.join(f)).collect();
+                if paths.iter().any(|path| !path.is_file()) {
+                    let reason = "benchmark source not found in vendored checkout";
+                    skipped.push(BenchResult::skipped(*group, UNIT, reason));
+                } else {
+                    files.extend(paths);
+                    loaded_groups += 1;
+                }
+            }
+        }
+
+        // With nothing loaded the driver would run zero suites, so skip spawning bs entirely.
+        if loaded_groups == 0 {
+            return ctx.suite_run(self, 0.0, skipped, None);
+        }
+
+        files.push(ctx.shim("octane.js"));
+
+        let flags = vec!["--expose-test-shell-compat".to_string()];
+        let output = match run_bs(ctx.bs_path, &flags, &files, ctx.flamegraph(None).as_ref()) {
+            Ok(output) => output,
+            Err(e) => {
+                return ctx.single_error_run(self, UNIT, format!("failed to spawn bs: {e}"));
+            }
+        };
+
+        let (mut results, summary) = parse_octane_output(&output.stdout);
+
+        // Report a run-level error only when bs failed and no overall score was parsed.
+        let scored_or_clean_exit = summary.is_some() || output.success;
+        if !scored_or_clean_exit {
+            let detail = match first_nonempty_line(&output.stderr) {
+                Some(line) => line,
+                None => "bs exited unsuccessfully with no score",
+            };
+            results.push(BenchResult::error("<run>", UNIT, detail.to_string()));
+        }
+
+        // Skipped groups are listed after the results parsed from the run.
+        results.extend(skipped);
+        ctx.suite_run(self, output.wall_clock_ms, results, summary)
+    }
+}
+
+/// Collects per-benchmark results and the overall score from the driver's stdout. Recognised
+/// lines are `RESULT <name> <score>`, `ERROR <name> [message]` and `SCORE <score>`; anything
+/// else, including lines whose score does not parse as a number, is ignored.
+fn parse_octane_output(stdout: &str) -> (Vec<BenchResult>, Option<BenchResult>) {
+    let mut results = Vec::new();
+    let mut summary = None;
+
+    for line in stdout.lines() {
+        // At most three space-separated fields; the last one keeps any further spaces.
+        let mut fields = line.splitn(3, ' ');
+        match (fields.next(), fields.next(), fields.next()) {
+            // `RESULT <name> <score>`
+            (Some("RESULT"), Some(name), Some(score)) => {
+                if let Ok(score) = score.trim().parse::<f64>() {
+                    results.push(BenchResult::ok(name, score, UNIT));
+                }
+            }
+            // `ERROR <name> [message]`; a missing message becomes the empty string.
+            (Some("ERROR"), Some(name), message) => {
+                let message = message.unwrap_or("").to_string();
+                results.push(BenchResult::error(name, UNIT, message));
+            }
+            // `SCORE <score>`; the score is the second field.
+            (Some("SCORE"), Some(score), _) => {
+                if let Ok(score) = score.trim().parse::<f64>() {
+                    summary = Some(BenchResult::ok("Total", score, UNIT));
+                }
+            }
+            _ => {}
+        }
+    }
+
+    (results, summary)
+}
diff --git a/tests/perf/src/suites/web_tooling.rs b/tests/perf/src/suites/web_tooling.rs
new file mode 100644
index 00000000..2987753d
--- /dev/null
+++ b/tests/perf/src/suites/web_tooling.rs
@@ -0,0 +1,145 @@
+use std::path::Path;
+
+use regex::Regex;
+
+use crate::report::{BenchResult, SuiteRun};
+use crate::runner::run_bs;
+use crate::suite::{RunContext, SuiteSpec, first_nonempty_line, geomean_summary};
+
+const UNIT: &str = "runs/s"; // unit label attached to every result of this suite
+
+/// All individual benchmarks in this suite; `bench` maps to the bundle `dist/cli-<bench>.js`.
+const BENCHMARKS: &[&str] = &[
+    "acorn",
+    "babel",
+    "babel-minify",
+    "babylon",
+    "buble",
+    "chai",
+    "coffeescript",
+    "espree",
+    "esprima",
+    "jshint",
+    "lebab",
+    "postcss",
+    "prepack",
+    "prettier",
+    "source-map",
+    "terser",
+    "typescript",
+    "uglify-js",
+];
+
+/// Web Tooling Benchmark suite: one standalone bundle per tool, each in a fresh `bs` process.
+pub struct WebTooling;
+impl SuiteSpec for WebTooling {
+    fn name(&self) -> &'static str {
+        "web-tooling"
+    }
+    /// Vendored checkout directory name.
+    fn vendor_subdir(&self) -> &'static str {
+        "web-tooling-benchmark"
+    }
+    /// Available once the first tool's bundle exists; the per-tool bundles are what gets run.
+    fn is_available(&self, ctx: &RunContext) -> bool {
+        let sentinel = bundle_name(BENCHMARKS[0]);
+        ctx.suite_dir(self.vendor_subdir()).join(sentinel).is_file()
+    }
+    /// Scores are recorded with a precision of 2.
+    fn pretty_score_precision(&self) -> usize {
+        2
+    }
+    /// Runs the bundle of every tool admitted by `ctx.filter`, in list order, and totals the
+    /// wall-clock time of the individual processes. The geometric-mean summary is produced
+    /// only when the filter is empty.
+    fn run(&self, ctx: &RunContext) -> SuiteRun {
+        let suite_dir = ctx.suite_dir(self.vendor_subdir());
+
+        // Each benchmark gets its own bundle and therefore a fresh `bs` process. The
+        // combined bundle runs all 18 tools in one process, and its cumulative memory
+        // exceeds bs's heap by the time `typescript` runs, causing a fatal OOM that drops
+        // the remaining tools. Per-tool runs avoid that and keep failures isolated to the
+        // tool that caused them.
+        let mut wall_clock_ms = 0.0;
+        let results: Vec<BenchResult> = BENCHMARKS
+            .iter()
+            .copied()
+            .filter(|bench| ctx.filter.matches(bench))
+            .map(|bench| {
+                let bundle = suite_dir.join(bundle_name(bench));
+                let (result, ms) = run_one(ctx, bench, &bundle);
+                wall_clock_ms += ms;
+                result
+            })
+            .collect();
+
+        // The overall geometric mean is the suite metric only for a full run; a --bench
+        // subset does not get one.
+        let full_run = ctx.filter.is_empty();
+        let summary = full_run.then(|| geomean_summary(&results)).flatten();
+
+        ctx.suite_run(self, wall_clock_ms, results, summary)
+    }
+}
+
+/// Relative path, inside the vendored directory, of the standalone bundle built for `bench`.
+fn bundle_name(bench: &str) -> String {
+    ["dist/cli-", bench, ".js"].concat()
+}
+
+/// Runs the standalone `bundle` of `bench` in a fresh `bs` process. Returns the first result
+/// parsed from its stdout, or an error result, together with the wall-clock milliseconds
+/// (0.0 when the bundle is missing or `bs` could not be spawned).
+fn run_one(ctx: &RunContext, bench: &str, bundle: &Path) -> (BenchResult, f64) {
+    if !bundle.is_file() {
+        let message = "standalone bundle missing; re-run install.sh";
+        return (BenchResult::error(bench, UNIT, message), 0.0);
+    }
+
+    let files = [bundle.to_path_buf()];
+    let spawned = run_bs(ctx.bs_path, &flags(), &files, ctx.flamegraph(Some(bench)).as_ref());
+    let output = match spawned {
+        Ok(output) => output,
+        Err(e) => {
+            return (BenchResult::error(bench, UNIT, format!("failed to spawn bs: {e}")), 0.0);
+        }
+    };
+    // Only one benchmark runs per bundle, so the first parsed result is the one we want.
+    let (results, _) = parse_web_tooling_output(&output.stdout);
+    let result = match results.into_iter().next() {
+        Some(parsed) => parsed,
+        None => {
+            let detail = first_nonempty_line(&output.stderr).unwrap_or("no result produced");
+            BenchResult::error(bench, UNIT, detail.to_string())
+        }
+    };
+    (result, output.wall_clock_ms)
+}
+
+/// Parses `<tool>: <n> runs/s` lines from `stdout` into per-tool results. A line named
+/// "geometric mean" (any ASCII case) becomes the "Total" summary instead; lines that do not
+/// match, or whose number does not parse as `f64`, are ignored.
+fn parse_web_tooling_output(stdout: &str) -> (Vec<BenchResult>, Option<BenchResult>) {
+    // Compiled once per process rather than on every call: this runs once per benchmark.
+    static LINE: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
+        Regex::new(r"^\s*(.+?):\s+([\d.]+)\s+runs/s").expect("hard-coded pattern is valid")
+    });
+    let mut results = Vec::new();
+    let mut summary = None;
+    for captures in stdout.lines().filter_map(|line| LINE.captures(line)) {
+        let Ok(score) = captures[2].parse::<f64>() else {
+            continue;
+        };
+        let name = captures[1].trim().to_string();
+        if name.eq_ignore_ascii_case("geometric mean") {
+            summary = Some(BenchResult::ok("Total", score, UNIT));
+        } else {
+            results.push(BenchResult::ok(name, score, UNIT));
+        }
+    }
+    (results, summary)
+}
+
+fn flags() -> Vec<String> {
+    vec![String::from("--expose-test-shell-compat")] // flags handed to `bs` for every bundle
+}