64 changes: 64 additions & 0 deletions examples/reduce_example.html
@@ -0,0 +1,64 @@
<!DOCTYPE html>
<html>

<head>
<meta charset="utf-8" />
<title>Standalone WebGPU Reduce Primitive Test</title>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/default.min.css" />
</head>

<body>
<h1>WebGPU Reduce Primitive Example</h1>
<p>
This example is a self-contained use of the <code>reduce</code> primitive.
It computes a sum-reduction on an input array of 2<sup>24</sup>
i32s, producing a single output value.
<a href="https://github.com/gridwise-webgpu/gridwise/blob/main/examples/reduce_example.mjs">The full JS source
file is on GitHub.</a>
While the full file contains substantial WebGPU and input/output setup
boilerplate, the important parts follow.
</p>
<p>
First, we declare the reduce primitive, configuring its datatype
(<code>i32</code>), its binary operation (<code>BinOpAdd</code>, a
sum-reduction on i32s), and its type (<code>"reduce"</code>).
</p>
<pre><code class="language-javascript">const datatype = "i32";
const binop = new BinOpAdd({ datatype });
const reducePrimitive = new DLDFScan({
device,
binop,
type: "reduce",
datatype,
});
const primitive = reducePrimitive;</code></pre>
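<p>
Next, the example allocates GPU buffers for the input and output. The
sketch below mirrors the buffer setup in the full source file, wrapped
in a helper function here purely for illustration (in the source these
calls appear at the top level):
</p>
<pre><code class="language-javascript">/* Sketch of the buffer setup: upload the input array, and size the
 * output at 4 bytes, since reduce produces a single value. */
function createReduceBuffers(device, datatype, memsrc) {
  const memsrcBuffer = device.createBuffer({
    label: `memory source buffer (${datatype})`,
    size: memsrc.byteLength,
    usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST,
  });
  /* copy the host-side typed array into the GPU buffer */
  device.queue.writeBuffer(memsrcBuffer, 0, memsrc);
  const memdestBuffer = device.createBuffer({
    label: "memory destination buffer",
    size: 4 /* one 4-byte element */,
    usage:
      GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST,
  });
  return { memsrcBuffer, memdestBuffer };
}</code></pre>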
<p>
We have declared buffers (using WebGPU's <code>device.createBuffer</code>)
called <code>memsrcBuffer</code> and <code>memdestBuffer</code>. Note that
for reduce, the output buffer is only 4 bytes (a single value). We then
call the primitive's <code>execute</code> method (note that
<code>execute</code> is <code>async</code>):
</p>
<pre><code class="language-javascript">await primitive.execute({
inputBuffer: memsrcBuffer,
outputBuffer: memdestBuffer,
});</code></pre>
<p>We then read back the result from the GPU and validate it.</p>
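<p>
The readback copies the 4-byte result into a mappable staging buffer,
maps that buffer, and copies its contents out into a typed array. The
sketch below mirrors the readback in the full source file, wrapped in a
helper function here for illustration:
</p>
<pre><code class="language-javascript">/* Copy the result into a mappable staging buffer, then map and read it. */
async function readBackResult(device, memdestBuffer, mappableMemdestBuffer) {
  const encoder = device.createCommandEncoder();
  encoder.copyBufferToBuffer(
    memdestBuffer, 0, mappableMemdestBuffer, 0, mappableMemdestBuffer.size
  );
  device.queue.submit([encoder.finish()]);
  await mappableMemdestBuffer.mapAsync(GPUMapMode.READ);
  /* slice() copies the data out before unmap() invalidates the range */
  const result = new Int32Array(mappableMemdestBuffer.getMappedRange().slice());
  mappableMemdestBuffer.unmap();
  return result;
}</code></pre>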
<p>
Your developer console should let you inspect the input array and the
single output value, and prints "Validation passed" if the output
matches the (CPU-computed) reference.
</p>
<pre><code>input Int32Array(16777216) [array of values...]
output Int32Array(1) [reduced_value]
Validation passed
</code></pre>
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js"></script>
<script>
hljs.highlightAll();
</script>
<div id="plot"></div>
<script src="reduce_example.mjs" type="module"></script>
</body>

</html>
145 changes: 145 additions & 0 deletions examples/reduce_example.mjs
@@ -0,0 +1,145 @@
import { BinOpAdd, BinOpMax, BinOpMin } from "../binop.mjs";
import { datatypeToTypedArray } from "../util.mjs";
import { DLDFScan } from "../scandldf.mjs";

export async function main(navigator) {
/* set up a WebGPU device */
const adapter = await navigator.gpu?.requestAdapter();
const hasSubgroups = adapter?.features.has("subgroups");
const hasTimestampQuery = adapter?.features.has("timestamp-query");
const device = await adapter?.requestDevice({
requiredFeatures: [
...(hasTimestampQuery ? ["timestamp-query"] : []),
...(hasSubgroups ? ["subgroups"] : []),
],
});

if (!device) {
console.error("Fatal error: Device does not support WebGPU.");
return;
}

/* configure the primitive */
/**
* Choices for configuring this primitive:
* datatype: "i32", "u32", "f32"
* binop: anything in binop.mjs
* - Make sure it's imported (at top of file)
* - BinOpMin, BinOpMax, BinOpAdd will work
* - BinOpMultiply will work but is likely to overflow
* inputLength: any multiple of 4 up to max GPUBuffer length
*/
const datatype = "i32";
const binop = new BinOpAdd({ datatype });
const inputLength = 2 ** 24; // this is item count, not byte count

/* generate an input dataset */
if (inputLength % 4 !== 0) {
console.warn(
`Input length (currently ${inputLength}) must be divisible by 4; output is likely to be incorrect`
);
}
const memsrc = new (datatypeToTypedArray(datatype))(inputLength);
/* the gymnastics below are to generate GPU-native {i,u,f}32
* datatypes; there's probably an easier/faster way to do it */
for (let i = 0; i < inputLength; i++) {
switch (datatype) {
case "u32":
/* [0, 31], ints */
memsrc[i] = Math.floor(Math.random() * Math.pow(2, 5));
break;
case "i32":
/* [-1023, 1023], ints */
memsrc[i] =
(Math.random() < 0.5 ? 1 : -1) *
Math.floor(Math.random() * Math.pow(2, 10));
break;
case "f32":
/* integer-valued floats in [-1023, 1023], so the sum stays exact */
memsrc[i] =
(Math.random() < 0.5 ? 1 : -1) *
Math.floor(Math.random() * Math.pow(2, 10));
break;
}
}
console.log("input", memsrc);

/* declare the primitive */
const reducePrimitive = new DLDFScan({
device,
binop,
type: "reduce",
datatype,
});

const primitive = reducePrimitive;

/* size the output - reduce outputs a single value */
const memdestBytes = 4; // always 4 bytes for a single value

/* allocate/create buffers on the GPU to hold in/out data */
const memsrcBuffer = device.createBuffer({
label: `memory source buffer (${datatype})`,
size: memsrc.byteLength,
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST,
});
device.queue.writeBuffer(memsrcBuffer, 0, memsrc);

const memdestBuffer = device.createBuffer({
label: "memory destination buffer",
size: memdestBytes,
usage:
GPUBufferUsage.STORAGE |
GPUBufferUsage.COPY_SRC |
GPUBufferUsage.COPY_DST /* COPY_DST necessary for initialization */,
});

const mappableMemdestBuffer = device.createBuffer({
label: "mappable memory destination buffer",
size: memdestBytes,
usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST,
});

/* actually run the primitive */
await primitive.execute({
inputBuffer: memsrcBuffer,
outputBuffer: memdestBuffer,
});

/* copy output back to host */
const encoder = device.createCommandEncoder({
label: "copy result GPU->CPU encoder",
});
encoder.copyBufferToBuffer(
memdestBuffer,
0,
mappableMemdestBuffer,
0,
mappableMemdestBuffer.size
);
const commandBuffer = encoder.finish();
device.queue.submit([commandBuffer]);

await mappableMemdestBuffer.mapAsync(GPUMapMode.READ);
const memdest = new (datatypeToTypedArray(datatype))(
mappableMemdestBuffer.getMappedRange().slice()
);
mappableMemdestBuffer.unmap();

console.log("output", memdest);

if (primitive.validate) {
const errorstr = primitive.validate({
inputBuffer: memsrc,
outputBuffer: memdest,
});
if (errorstr === "") {
console.info("Validation passed");
} else {
console.error(`Validation failed:\n${errorstr}`);
}
}
}

main(navigator);
79 changes: 79 additions & 0 deletions examples/reduce_perf.html
@@ -0,0 +1,79 @@
<!DOCTYPE html>
<html>

<head>
<meta charset="utf-8" />
<title>
Standalone WebGPU Reduce Primitive Test, with Configuration Pane
</title>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/default.min.css" />
</head>

<body>
<p>
This example is a self-contained use of the <code>reduce</code>
primitive, meant to plot performance. This builds on
the simpler
<a href="reduce_example.html">functionality example</a>. Set your
parameters in the pane and click "Start" to run and plot performance data
for a WebGPU reduce. The <code>inputCount</code> input specifies
how many different input lengths to run, which will be evenly
(logarithmically) interpolated between the specified start and end
lengths. Otherwise, the parameters are the same as in the
<a href="reduce_example.html">functionality example</a>. This
example demonstrates
<a href="/gridwise/docs/gridwise/timing-strategy.html">how to time a Gridwise primitive</a>.
<a href="https://github.com/gridwise-webgpu/gridwise/blob/main/examples/reduceperf.mjs">The full JS source
file is on GitHub.</a>
</p>
<p>
To measure CPU and/or GPU timing, include a timing directive in the call
to <code>primitive.execute</code>. Typically we call the primitive once
without any timing information to absorb warmup effects (e.g., compiling
the kernel), then call it <code>params.trials</code> times with timing
enabled and divide the total runtime by the number of trials.
</p>
<pre><code class="language-javascript">/* call the primitive once to warm up */
await primitive.execute({
inputBuffer: memsrcBuffer,
outputBuffer: memdestBuffer,
});
/* call params.trials times */
await primitive.execute({
inputBuffer: memsrcBuffer,
outputBuffer: memdestBuffer,
trials: params.trials, /* integer */
enableGPUTiming: true,
enableCPUTiming: true,
});</code></pre>
<p>
We can get timing information back from the primitive with a
<code>getTimingResult</code> call. The GPU time may be an array of
timings if the GPU call comprises multiple kernels; in the example
below, we flatten that array by summing it into a total time.
</p>
<pre><code class="language-javascript">let { gpuTotalTimeNS, cpuTotalTimeNS } = await primitive.getTimingResult();
if (gpuTotalTimeNS instanceof Array) {
// gpuTotalTimeNS might be a list, in which case just sum it up
gpuTotalTimeNS = gpuTotalTimeNS.reduce((x, a) => x + a, 0);
}
const averageGpuTotalTimeNS = gpuTotalTimeNS / params.trials;
const averageCpuTotalTimeNS = cpuTotalTimeNS / params.trials;</code></pre>
<p>
The <code>reduce</code> primitive computes a single output value from an
input array using a binary operation (such as add, max, or min). This makes
it simpler to time than sort (which overwrites its input) since the input
remains unchanged after each execution.
</p>
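<p>
The binary operation is just an associative combine step folded over the
input. As a CPU-side sanity check (a sketch for illustration, not part
of the library API; <code>binopFn</code> and <code>identity</code> are
hypothetical stand-ins), a reference reduction can be computed like
this:
</p>
<pre><code class="language-javascript">/* CPU reference reduction: fold a binary operation over the input.
 * `binopFn` and `identity` are illustrative stand-ins, not library API. */
function cpuReferenceReduce(memsrc, binopFn, identity) {
  let acc = identity;
  for (const v of memsrc) {
    acc = binopFn(acc, v);
  }
  return acc;
}

/* sum-reduction on i32s, matching BinOpAdd */
const expected = cpuReferenceReduce(
  new Int32Array([3, -1, 4, 1, 5, 9]),
  (a, b) => a + b,
  0
); /* 21 */</code></pre>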
<hr />
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js"></script>
<script>
hljs.highlightAll();
</script>
<script src="reduce_perf.mjs" type="module"></script>
<div id="webgpu-results"></div>
<div id="plot"></div>
</body>

</html>