From 0470be1ca90b4766b3a8876a71671dd1efbd1d0f Mon Sep 17 00:00:00 2001
From: jayshah1819 <jayshah.jk.jk18@gmail.com>
Date: Sun, 19 Oct 2025 22:28:35 -0400
Subject: [PATCH] Add reduce primitive examples and performance testing

---
 examples/reduce_example.html |  64 +++++++
 examples/reduce_example.mjs  | 145 +++++++++++++++
 examples/reduce_perf.html    |  79 +++++++++
 examples/reduce_perf.mjs     | 330 +++++++++++++++++++++++++++++++++++
 4 files changed, 618 insertions(+)
 create mode 100644 examples/reduce_example.html
 create mode 100644 examples/reduce_example.mjs
 create mode 100644 examples/reduce_perf.html
 create mode 100644 examples/reduce_perf.mjs
diff --git a/examples/reduce_example.html b/examples/reduce_example.html
new file mode 100644
index 0000000..b55fef9
--- /dev/null
+++ b/examples/reduce_example.html
@@ -0,0 +1,64 @@
+<!DOCTYPE html>
+<html>
+
+<head>
+    <meta charset="utf-8" />
+    <title>Standalone WebGPU Reduce Primitive Test</title>
+    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/default.min.css" />
+</head>
+
+<body>
+    <h1>WebGPU Reduce Primitive Example</h1>
+    <p>
+        This example is a self-contained use of the <code>reduce</code> primitive.
+        It computes a sum-reduction on an input array of 2<sup>24</sup>
+        i32s, producing a single output value.
+        <a href="https://github.com/gridwise-webgpu/gridwise/blob/main/examples/reduce_example.mjs">The entire JS source
+            file is in github.</a>
+        While the entire file contains substantial WebGPU and input-output setup
+        boilerplate, the important parts follow.
+    </p>
+    <p>
+        First, we declare the reduce primitive. We configure its datatype (i32), the
+        binary operation for reduce (sum-reduction on i32s), and the type
+        (reduce).
+    </p>
+    <pre><code class="language-javascript">const datatype = "i32";
+const binop = new BinOpAdd({ datatype });
+const reducePrimitive = new DLDFScan({
+  device,
+  binop,
+  type: "reduce",
+  datatype,
+});
+const primitive = reducePrimitive;</code></pre>
+    <p>
+        We have declared buffers (using WebGPU's <code>device.createBuffer</code>)
+        called <code>memsrcBuffer</code> and <code>memdestBuffer</code>. Note that
+        for reduce, the output buffer is only 4 bytes (a single value). We then
+        call the primitive's <code>execute</code> procedure (note that
+        <code>execute</code> is <code>async</code>):
+    </p>
+    <pre><code class="language-javascript">await primitive.execute({
+  inputBuffer: memsrcBuffer,
+  outputBuffer: memdestBuffer,
+});</code></pre>
+    <p>We then read back the result from the GPU and validate it.</p>
+    <p>
+        Your developer console should show a result that allows you to inspect the
+        input array and the single output value, and prints "Validation passed" if the output
+        matches the (CPU-computed) reference.
+    </p>
+    <pre><code>input Int32Array(16777216) [array of values...]
+output Int32Array(1) [reduced_value]
+Validation passed
+</code></pre>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js"></script>
+    <script>
+        hljs.highlightAll();
+    </script>
+    <div id="plot"></div>
+    <script src="reduce_example.mjs" type="module"></script>
+</body>
+
+</html>
\ No newline at end of file
diff --git a/examples/reduce_example.mjs b/examples/reduce_example.mjs
new file mode 100644
index 0000000..a126cb6
--- /dev/null
+++ b/examples/reduce_example.mjs
@@ -0,0 +1,145 @@
+import { BinOpAdd, BinOpMax, BinOpMin } from "../binop.mjs";
+import { datatypeToTypedArray } from "../util.mjs";
+import { DLDFScan } from "../scandldf.mjs";
+
+/**
+ * Sum-reduces random input on the GPU with DLDFScan and logs whether it validates.
+ * @param {Navigator} navigator - its `gpu` property is the WebGPU entry point
+ */
+export async function main(navigator) {
+    /* set up a WebGPU device; adapter is nullish when WebGPU is unavailable */
+    const adapter = await navigator.gpu?.requestAdapter();
+    const hasSubgroups = adapter?.features.has("subgroups") ?? false;
+    const hasTimestampQuery = adapter?.features.has("timestamp-query") ?? false;
+    const device = await adapter?.requestDevice({
+        requiredFeatures: [
+            ...(hasTimestampQuery ? ["timestamp-query"] : []),
+            ...(hasSubgroups ? ["subgroups"] : []),
+        ],
+    });
+
+    if (!device) {
+        console.error("Fatal error: Device does not support WebGPU.");
+        return;
+    }
+
+    /* configure the primitive */
+    /**
+     * Choices for configuring this primitive:
+     * datatype: "i32", "u32", "f32"
+     * binop: anything in binop.mjs
+     * - Make sure it's imported (at top of file)
+     * - BinOpMin, BinOpMax, BinOpAdd will work
+     * - BinOpMultiply will work but is likely to overflow
+     * inputLength: any multiple of 4 up to max GPUBuffer length
+     */
+    const datatype = "i32";
+    const binop = new BinOpAdd({ datatype });
+    const inputLength = 2 ** 24; // this is item count, not byte count
+
+    /* generate an input dataset */
+    if (inputLength % 4 !== 0) {
+        console.warn(
+            "Input length (currently: ",
+            inputLength,
+            ") must be divisible by 4 (output is likely to be incorrect) "
+        );
+    }
+    const memsrc = new (datatypeToTypedArray(datatype))(inputLength);
+    /* the gymnastics below are to try to generate GPU-native {i,u,f}32
+     * datatypes, there's probably an easier/faster way to do it */
+    for (let i = 0; i < inputLength; i++) {
+        switch (datatype) {
+            case "u32":
+                /* [0, 31], ints */
+                memsrc[i] = Math.floor(Math.random() * Math.pow(2, 5));
+                break;
+            case "i32":
+            case "f32":
+                /* ints in [-1023, 1023]; every such value is exact in f32 too */
+                memsrc[i] =
+                    (Math.random() < 0.5 ? 1 : -1) *
+                    Math.floor(Math.random() * Math.pow(2, 10));
+                break;
+        }
+    }
+    console.log("input", memsrc);
+
+    /* declare the primitive */
+    const reducePrimitive = new DLDFScan({
+        device,
+        binop,
+        type: "reduce",
+        datatype,
+    });
+
+    const primitive = reducePrimitive;
+
+    /* size the output - reduce outputs a single value */
+    const memdestBytes = 4; // always 4 bytes for a single value
+
+    /* allocate/create buffers on the GPU to hold in/out data */
+    const memsrcBuffer = device.createBuffer({
+        label: `memory source buffer (${datatype})`,
+        size: memsrc.byteLength,
+        usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST,
+    });
+    device.queue.writeBuffer(memsrcBuffer, 0, memsrc);
+
+    const memdestBuffer = device.createBuffer({
+        label: "memory destination buffer",
+        size: memdestBytes,
+        usage:
+            GPUBufferUsage.STORAGE |
+            GPUBufferUsage.COPY_SRC |
+            GPUBufferUsage.COPY_DST /* COPY_DST necessary for initialization */,
+    });
+
+    const mappableMemdestBuffer = device.createBuffer({
+        label: "mappable memory destination buffer",
+        size: memdestBytes,
+        usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST,
+    });
+
+    /* actually run the primitive */
+    await primitive.execute({
+        inputBuffer: memsrcBuffer,
+        outputBuffer: memdestBuffer,
+    });
+
+    /* copy output back to host */
+    const encoder = device.createCommandEncoder({
+        label: "copy result GPU->CPU encoder",
+    });
+    encoder.copyBufferToBuffer(
+        memdestBuffer,
+        0,
+        mappableMemdestBuffer,
+        0,
+        mappableMemdestBuffer.size
+    );
+    const commandBuffer = encoder.finish();
+    device.queue.submit([commandBuffer]);
+
+    await mappableMemdestBuffer.mapAsync(GPUMapMode.READ);
+    const memdest = new (datatypeToTypedArray(datatype))(
+        mappableMemdestBuffer.getMappedRange().slice()
+    );
+    mappableMemdestBuffer.unmap();
+
+    console.log("output", memdest);
+
+    if (primitive.validate) {
+        const errorstr = primitive.validate({
+            inputBuffer: memsrc,
+            outputBuffer: memdest,
+        });
+        if (errorstr === "") {
+            console.info("Validation passed");
+        } else {
+            console.error(`Validation failed:\n${errorstr}`);
+        }
+    }
+}
+
+main(navigator);
\ No newline at end of file
diff --git a/examples/reduce_perf.html b/examples/reduce_perf.html
new file mode 100644
index 0000000..0c842e9
--- /dev/null
+++ b/examples/reduce_perf.html
@@ -0,0 +1,79 @@
+<!DOCTYPE html>
+<html>
+
+<head>
+    <meta charset="utf-8" />
+    <title>
+        Standalone WebGPU Reduce Primitive Test, with Configuration Pane
+    </title>
+    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/default.min.css" />
+</head>
+
+<body>
+    <p>
+        This example is a self-contained use of the <code>reduce</code>
+        primitive, meant to plot performance. This builds on
+        the simpler
+        <a href="reduce_example.html">functionality example</a>. Set your
+        parameters in the pane and click "Start" to run and plot performance data
+        for a WebGPU reduce. The <code>inputCount</code> input specifies
+        how many different input lengths to run, which will be evenly
+        (logarithmically) interpolated between the specified start and end
+        lengths. Otherwise, the parameters are the same as in the
+        <a href="reduce_example.html">functionality example</a>. This
+        example explains
+        <a href="/gridwise/docs/gridwise/timing-strategy.html">how to time a Gridwise primitive</a>.
+        <a href="https://github.com/gridwise-webgpu/gridwise/blob/main/examples/reduce_perf.mjs">The entire JS source
+            file is in github.</a>
+    </p>
+    <p>
+        To measure CPU and/or GPU timing, include a timing directive in the call
+        to <code>primitive.execute</code>. Typically we call the primitive once
+        without any timing information to handle warmup effects (e.g., compiling
+        the kernel) and then call the kernel many times and average the runtimes
+        of that second set of calls. We then average the total runtime over the
+        number of trials.
+    </p>
+    <pre><code class="language-javascript">/* call the primitive once to warm up */
+await primitive.execute({
+  inputBuffer: memsrcBuffer,
+  outputBuffer: memdestBuffer,
+});
+/* call params.trials times */
+await primitive.execute({
+  inputBuffer: memsrcBuffer,
+  outputBuffer: memdestBuffer,
+  trials: params.trials, /* integer */
+  enableGPUTiming: true,
+  enableCPUTiming: true,
+});</code></pre>
+    <p>
+        We can get timing information back from the primitive with a <code>getTimingResult</code>
+        call. The GPU time might be an array of timings if the GPU call has
+        multiple kernels within it. In the below example, we simply flatten that
+        array by adding it up into a total time.
+    </p>
+    <pre><code class="language-javascript">let { gpuTotalTimeNS, cpuTotalTimeNS } = await primitive.getTimingResult();
+if (gpuTotalTimeNS instanceof Array) {
+  // gpuTotalTimeNS might be a list, in which case just sum it up
+  gpuTotalTimeNS = gpuTotalTimeNS.reduce((x, a) => x + a, 0);
+}
+averageGpuTotalTimeNS = gpuTotalTimeNS / params.trials;
+averageCpuTotalTimeNS = cpuTotalTimeNS / params.trials;</code></pre>
+    <p>
+        The <code>reduce</code> primitive computes a single output value from an
+        input array using a binary operation (such as add, max, or min). This makes
+        it simpler to time than sort (which overwrites its input) since the input
+        remains unchanged after each execution.
+    </p>
+    <hr />
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js"></script>
+    <script>
+        hljs.highlightAll();
+    </script>
+    <script src="reduce_perf.mjs" type="module"></script>
+    <div id="webgpu-results"></div>
+    <div id="plot"></div>
+</body>
+
+</html>
\ No newline at end of file
diff --git a/examples/reduce_perf.mjs b/examples/reduce_perf.mjs
new file mode 100644
index 0000000..ce220a2
--- /dev/null
+++ b/examples/reduce_perf.mjs
@@ -0,0 +1,330 @@
+import { Pane } from "https://cdn.jsdelivr.net/npm/tweakpane@4.0.5/dist/tweakpane.min.js";
+import { BinOpAdd, BinOpMax, BinOpMin, makeBinOp } from "../binop.mjs";
+import {
+    datatypeToTypedArray,
+    logspaceRounded,
+    datatypeToBytes,
+} from "../util.mjs";
+import { DLDFScan } from "../scandldf.mjs";
+
+let Plot = await import(
+    "https://cdn.jsdelivr.net/npm/@observablehq/plot@0.6/+esm"
+);
+
+/* set up a WebGPU device; adapter (and so device) is nullish without WebGPU */
+const adapter = await navigator.gpu?.requestAdapter();
+const hasSubgroups = adapter?.features.has("subgroups") ?? false;
+const hasTimestampQuery = adapter?.features.has("timestamp-query") ?? false;
+const device = await adapter?.requestDevice({
+    requiredFeatures: [
+        ...(hasTimestampQuery ? ["timestamp-query"] : []),
+        ...(hasSubgroups ? ["subgroups"] : []),
+    ],
+});
+
+if (!device) {
+    throw new Error("Fatal error: Device does not support WebGPU.");
+}
+
+/* set up the UI, with parameters stored in the "params" object */
+const pane = new Pane();
+const params = {
+    /* defaults */
+    datatype: "u32",
+    binop: "add",
+    inputLengthStart: 2 ** 20,
+    inputLengthEnd: 2 ** 22,
+    inputCount: 3,
+    trials: 5,
+};
+
+pane.addBinding(params, "datatype", {
+    options: {
+        // what it shows : what it returns
+        u32: "u32",
+        i32: "i32",
+        f32: "f32",
+    },
+});
+
+pane.addBinding(params, "binop", {
+    options: {
+        // what it shows : what it returns
+        add: "add",
+        max: "max",
+        min: "min",
+    },
+});
+
+pane.addBinding(params, "inputLengthStart", { format: (v) => Math.floor(v) });
+pane.addBinding(params, "inputLengthEnd", { format: (v) => Math.floor(v) });
+pane.addBinding(params, "inputCount", { format: (v) => Math.floor(v) });
+pane.addBinding(params, "trials", { format: (v) => Math.floor(v) });
+
+const button = pane.addButton({
+    title: "Start",
+});
+
+/* on "Start": sanitize parameters, run all experiments, then report */
+button.on("click", async () => {
+    /* lengths are rounded down to a multiple of 4, but never down to 0 */
+    const toMultipleOf4 = (v) => Math.max(4, Math.floor(v / 4) * 4);
+    params.inputLengthStart = toMultipleOf4(params.inputLengthStart);
+    params.inputLengthEnd = toMultipleOf4(params.inputLengthEnd);
+    /* at least one length and one trial: timings are divided by trials */
+    params.inputCount = Math.max(1, Math.floor(params.inputCount));
+    params.trials = Math.max(1, Math.floor(params.trials));
+
+    /* because inputLength may change here, we need to refresh the pane */
+    pane.refresh();
+    const results = document.getElementById("webgpu-results");
+    const validation = await buildAndRun();
+    results.innerHTML = `<p>I ran this</p>
+  <ul>
+  <li>Primitive: reduce
+  <li>Datatype: ${params.datatype}
+  <li>Binop: ${params.binop}
+  <li>Input length: ${params.inputCount} lengths from ${params.inputLengthStart} to ${params.inputLengthEnd} (items)
+  </ul>
+  <p>${validation}</p>`;
+});
+/* end of setting up the UI */
+
+/* all of the work is in this function */
+/**
+ * Benchmarks reduce at each configured input length, validates every output,
+ * and plots the timings; resolves to an HTML validation summary string.
+ */
+async function buildAndRun() {
+    let returnStr = "";
+    const results = []; // push new rows (experiments) onto this
+
+    for (const inputLength of logspaceRounded(
+        params.inputLengthStart,
+        params.inputLengthEnd,
+        params.inputCount
+    )) {
+        /* generate an input dataset */
+        const memsrc = new (datatypeToTypedArray(params.datatype))(inputLength);
+        /* generate ~random input datasets that are friendly for reduce */
+        for (let i = 0; i < inputLength; i++) {
+            switch (params.datatype) {
+                case "u32":
+                    /* roughly, [0, 32], ints */
+                    memsrc[i] = Math.floor(Math.random() * Math.pow(2, 5));
+                    break;
+                case "f32":
+                case "i32":
+                    /* roughly, [-1024, 1024], ints */
+                    memsrc[i] =
+                        (Math.random() < 0.5 ? 1 : -1) *
+                        Math.floor(Math.random() * Math.pow(2, 10));
+                    break;
+            }
+        }
+        console.log("input array", memsrc);
+
+        /* declare the primitive */
+        const primitive = new DLDFScan({
+            device,
+            binop: makeBinOp({ op: params.binop, datatype: params.datatype }),
+            type: "reduce",
+            datatype: params.datatype,
+        });
+
+        /* size the output - reduce always outputs a single value (4 bytes) */
+        const memdestBytes = 4;
+
+        /* allocate/create buffers on the GPU to hold in/out data */
+        const memsrcBuffer = device.createBuffer({
+            label: `memory source buffer (${params.datatype})`,
+            size: memsrc.byteLength,
+            usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST,
+        });
+        device.queue.writeBuffer(memsrcBuffer, 0, memsrc);
+
+        const memdestBuffer = device.createBuffer({
+            label: "memory destination buffer",
+            size: memdestBytes,
+            usage:
+                GPUBufferUsage.STORAGE |
+                GPUBufferUsage.COPY_SRC |
+                GPUBufferUsage.COPY_DST /* COPY_DST necessary for initialization */,
+        });
+
+        const mappableMemdestBuffer = device.createBuffer({
+            label: "mappable memory destination buffer",
+            size: memdestBytes,
+            usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST,
+        });
+
+        /* actually run the primitive */
+        /* call once, ignore result (warmup) */
+        await primitive.execute({
+            inputBuffer: memsrcBuffer,
+            outputBuffer: memdestBuffer,
+        });
+
+        /* call trials times */
+        await primitive.execute({
+            inputBuffer: memsrcBuffer,
+            outputBuffer: memdestBuffer,
+            trials: params.trials,
+            enableGPUTiming: hasTimestampQuery,
+            enableCPUTiming: true,
+        });
+
+        /* Append CPU and GPU timing results to "results" array */
+        let { gpuTotalTimeNS, cpuTotalTimeNS } = await primitive.getTimingResult();
+        console.log(gpuTotalTimeNS, cpuTotalTimeNS);
+        const result = {};
+        if (gpuTotalTimeNS instanceof Array) {
+            // gpuTotalTimeNS might be a list, in which case just sum it up
+            result.gpuTotalTimeNSArray = gpuTotalTimeNS;
+            gpuTotalTimeNS = gpuTotalTimeNS.reduce((x, a) => x + a, 0);
+        }
+        result.gputime = gpuTotalTimeNS / params.trials;
+        result.cputime = cpuTotalTimeNS / params.trials;
+        result.inputBytes = inputLength * datatypeToBytes(primitive.datatype);
+        result.bandwidthGPU = primitive.bytesTransferred / result.gputime;
+        result.bandwidthCPU = primitive.bytesTransferred / result.cputime;
+        result.inputItemsPerSecondE9GPU = inputLength / result.gputime;
+        result.inputItemsPerSecondE9CPU = inputLength / result.cputime;
+        results.push({
+            ...result,
+            timing: "GPU",
+            time: result.gputime,
+            bandwidth: result.bandwidthGPU,
+            inputItemsPerSecondE9: result.inputItemsPerSecondE9GPU,
+        });
+        results.push({
+            ...result,
+            timing: "CPU",
+            time: result.cputime,
+            bandwidth: result.bandwidthCPU,
+            inputItemsPerSecondE9: result.inputItemsPerSecondE9CPU,
+        });
+
+        /* copy output back to host */
+        const encoder = device.createCommandEncoder({
+            label: "copy result GPU->CPU encoder",
+        });
+        encoder.copyBufferToBuffer(
+            memdestBuffer,
+            0,
+            mappableMemdestBuffer,
+            0,
+            mappableMemdestBuffer.size
+        );
+        device.queue.submit([encoder.finish()]);
+
+        await mappableMemdestBuffer.mapAsync(GPUMapMode.READ);
+        const memdest = new (datatypeToTypedArray(params.datatype))(
+            mappableMemdestBuffer.getMappedRange().slice()
+        );
+        mappableMemdestBuffer.unmap();
+        /* the result is copied to the host: free this length's GPU buffers now */
+        memsrcBuffer.destroy();
+        memdestBuffer.destroy();
+        mappableMemdestBuffer.destroy();
+
+        console.log("output array", memdest);
+
+        if (primitive.validate) {
+            const errorstr = primitive.validate({
+                inputBuffer: memsrc,
+                outputBuffer: memdest,
+            });
+            if (errorstr === "") {
+                returnStr += `Validation passed (input length: ${inputLength})<br/>`;
+            } else {
+                returnStr += `Validation failed (input length: ${inputLength})<br/>${errorstr}<br/>`;
+            }
+        } else {
+            returnStr += `Validation not performed (input length: ${inputLength})<br/>`;
+        }
+    } /* end loop over input lengths */
+    plotResults(results);
+    return returnStr;
+}
+
+/**
+ * Renders one log-log line chart per entry of the local `plots` table
+ * (bandwidth and runtime) and appends each chart to the #plot element.
+ * @param {Array<object>} results - rows pushed by buildAndRun, one per timing
+ */
+function plotResults(results) {
+    console.log(results);
+    const plots = [
+        {
+            x: { field: "inputBytes", label: "Input array size (B)" },
+            y: { field: "bandwidth", label: "Bandwidth (GB/s)" },
+            stroke: { field: "timing" },
+            caption:
+                "BANDWIDTH | CPU timing (performance.now), GPU timing (timestamps)",
+        },
+        {
+            x: { field: "inputBytes", label: "Input array size (B)" },
+            y: { field: "time", label: "Runtime (ns)" },
+            stroke: { field: "timing" },
+            caption:
+                "RUNTIME | CPU timing (performance.now), GPU timing (timestamps)",
+        },
+    ];
+    for (const plot of plots) {
+        const mark = plot.mark ?? "lineY";
+        const schema = {
+            marks: [
+                Plot[mark](results, {
+                    x: plot.x.field,
+                    y: plot.y.field,
+                    ...("fx" in plot && { fx: plot.fx.field }),
+                    ...("fy" in plot && { fy: plot.fy.field }),
+                    ...("stroke" in plot && { stroke: plot.stroke.field }),
+                    tip: true,
+                }),
+                Plot.text(
+                    results,
+                    Plot.selectLast({
+                        x: plot.x.field,
+                        y: plot.y.field,
+                        ...("stroke" in plot && {
+                            z: plot.stroke.field,
+                            text: plot.stroke.field,
+                        }),
+                        ...("fx" in plot && { fx: plot.fx.field }),
+                        ...("fy" in plot && { fy: plot.fy.field }),
+                        textAnchor: "start",
+                        clip: false,
+                        dx: 3,
+                    })
+                ),
+                Plot.text([plot.text_tl ?? ""], {
+                    lineWidth: 30,
+                    dx: 5,
+                    frameAnchor: "top-left",
+                }),
+                /* array-wrapped like text_tl: the first argument is the mark's data */
+                Plot.text([plot.text_br ?? ""], {
+                    lineWidth: 30,
+                    dx: 5,
+                    frameAnchor: "bottom-right",
+                }),
+            ],
+            x: { type: "log", label: plot?.x?.label ?? "XLABEL" },
+            y: { type: "log", label: plot?.y?.label ?? "YLABEL" },
+            ...("fx" in plot && { fx: { label: plot.fx.label } }),
+            ...("fy" in plot && { fy: { label: plot.fy.label } }),
+            ...(("fx" in plot || "fy" in plot) && { grid: true }),
+            color: { type: "ordinal", legend: true },
+            width: 1280,
+            title: plot?.title,
+            subtitle: plot?.subtitle,
+            caption: plot?.caption,
+        };
+        const plotted = Plot.plot(schema);
+        const div = document.querySelector("#plot");
+        div.append(plotted);
+        div.append(document.createElement("hr"));
+    }
+}
\ No newline at end of file