From 0470be1ca90b4766b3a8876a71671dd1efbd1d0f Mon Sep 17 00:00:00 2001 From: jayshah1819 Date: Sun, 19 Oct 2025 22:28:35 -0400 Subject: [PATCH] Add reduce primitive examples and performance testing --- examples/reduce_example.html | 64 +++++++ examples/reduce_example.mjs | 145 +++++++++++++++ examples/reduce_perf.html | 79 +++++++++ examples/reduce_perf.mjs | 330 +++++++++++++++++++++++++++++++++++ 4 files changed, 618 insertions(+) create mode 100644 examples/reduce_example.html create mode 100644 examples/reduce_example.mjs create mode 100644 examples/reduce_perf.html create mode 100644 examples/reduce_perf.mjs diff --git a/examples/reduce_example.html b/examples/reduce_example.html new file mode 100644 index 0000000..b55fef9 --- /dev/null +++ b/examples/reduce_example.html @@ -0,0 +1,64 @@ + + + + + + Standalone WebGPU Reduce Primitive Test + + + + +

WebGPU Reduce Primitive Example

+

+ This example is a self-contained use of the reduce primitive. + It computes a sum-reduction on an input array of 2^24 + i32s, producing a single output value. + The entire JS source + file is on GitHub. + While the entire file contains substantial WebGPU and input-output setup + boilerplate, the important parts follow. +

+

+ First, we declare the reduce primitive. We configure its datatype (i32), the + binary operation for reduce (sum-reduction on i32s), and the type + (reduce). +

+
const datatype = "i32";
+const binop = new BinOpAdd({ datatype });
+const reducePrimitive = new DLDFScan({
+  device,
+  binop,
+  type: "reduce",
+  datatype,
+});
+const primitive = reducePrimitive;
+

+ We have declared buffers (using WebGPU's device.createBuffer) + called memsrcBuffer and memdestBuffer. Note that + for reduce, the output buffer is only 4 bytes (a single value). We then + call the primitive's execute procedure (note that + execute is async): +

+
await primitive.execute({
+  inputBuffer: memsrcBuffer,
+  outputBuffer: memdestBuffer,
+});
+

We then read back the result from the GPU and validate it.

+

+ Your developer console should show a result that allows you to inspect the + input array and the single output value, and prints "Validation passed" if the output + matches the (CPU-computed) reference. +

+
input Int32Array(16777216) [array of values...]
+output Int32Array(1) [reduced_value]
+Validation passed
+
+ + +
+ + + + \ No newline at end of file diff --git a/examples/reduce_example.mjs b/examples/reduce_example.mjs new file mode 100644 index 0000000..a126cb6 --- /dev/null +++ b/examples/reduce_example.mjs @@ -0,0 +1,145 @@ +import { BinOpAdd, BinOpMax, BinOpMin } from "../binop.mjs"; +import { datatypeToTypedArray } from "../util.mjs"; +import { DLDFScan } from "../scandldf.mjs"; + +export async function main(navigator) { + /* set up a WebGPU device */ + const adapter = await navigator.gpu?.requestAdapter(); + const hasSubgroups = adapter.features.has("subgroups"); + const hasTimestampQuery = adapter.features.has("timestamp-query"); + const device = await adapter?.requestDevice({ + requiredFeatures: [ + ...(hasTimestampQuery ? ["timestamp-query"] : []), + ...(hasSubgroups ? ["subgroups"] : []), + ], + }); + + if (!device) { + console.error("Fatal error: Device does not support WebGPU."); + } + + /* configure the primitive */ + /** + * Choices for configuring this primitive: + * datatype: "i32", "u32", "f32" + * binop: anything in binop.mjs + * - Make sure it's imported (at top of file) + * - BinOpMin, BinOpMax, BinOpAdd will work + * - BinOpMultiply will work but is likely to overflow + * inputLength: any multiple of 4 up to max GPUBuffer length + */ + const datatype = "i32"; + const binop = new BinOpAdd({ datatype }); + const inputLength = 2 ** 24; // this is item count, not byte count + + /* generate an input dataset */ + if (inputLength % 4 !== 0) { + console.warn( + "Input length (currently: ", + inputLength, + ") must be divisible by 4 (output is likely to be incorrect) " + ); + } + const memsrc = new (datatypeToTypedArray(datatype))(inputLength); + /* the gymnastics below are to try to generate GPU-native {i,u,f}32 + * datatypes, there's probably an easier/faster way to do it */ + for (let i = 0; i < inputLength; i++) { + switch (datatype) { + case "u32": + /* [0, 32], ints */ + memsrc[i] = Math.floor(Math.random() * Math.pow(2, 5)); + break; + case "i32": + /* 
[-1024, 1024], ints */ + memsrc[i] = + (Math.random() < 0.5 ? 1 : -1) * + Math.floor(Math.random() * Math.pow(2, 10)); + break; + case "f32": + /* attempt to evenly distribute floats between [-1023, 1023] */ + memsrc[i] = + (Math.random() < 0.5 ? 1 : -1) * + Math.floor(Math.random() * Math.pow(2, 10)); + break; + } + } + console.log("input", memsrc); + + /* declare the primitive */ + const reducePrimitive = new DLDFScan({ + device, + binop, + type: "reduce", + datatype, + }); + + const primitive = reducePrimitive; + + /* size the output - reduce outputs a single value */ + const memdestBytes = 4; // always 4 bytes for a single value + + /* allocate/create buffers on the GPU to hold in/out data */ + const memsrcBuffer = device.createBuffer({ + label: `memory source buffer (${datatype})`, + size: memsrc.byteLength, + usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST, + }); + device.queue.writeBuffer(memsrcBuffer, 0, memsrc); + + const memdestBuffer = device.createBuffer({ + label: "memory destination buffer", + size: memdestBytes, + usage: + GPUBufferUsage.STORAGE | + GPUBufferUsage.COPY_SRC | + GPUBufferUsage.COPY_DST /* COPY_DST necessary for initialization */, + }); + + const mappableMemdestBuffer = device.createBuffer({ + label: "mappable memory destination buffer", + size: memdestBytes, + usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST, + }); + + /* actually run the primitive */ + await primitive.execute({ + inputBuffer: memsrcBuffer, + outputBuffer: memdestBuffer, + }); + + /* copy output back to host */ + const encoder = device.createCommandEncoder({ + label: "copy result CPU->GPU encoder", + }); + encoder.copyBufferToBuffer( + memdestBuffer, + 0, + mappableMemdestBuffer, + 0, + mappableMemdestBuffer.size + ); + const commandBuffer = encoder.finish(); + device.queue.submit([commandBuffer]); + + await mappableMemdestBuffer.mapAsync(GPUMapMode.READ); + const memdest = new (datatypeToTypedArray(datatype))( + 
mappableMemdestBuffer.getMappedRange().slice() + ); + mappableMemdestBuffer.unmap(); + + console.log("output", memdest); + + if (primitive.validate) { + const errorstr = primitive.validate({ + inputBuffer: memsrc, + outputBuffer: memdest, + }); + if (errorstr === "") { + console.info("Validation passed"); + } else { + console.error(`Validation failed:\n${errorstr}`); + } + } +} + +main(navigator); \ No newline at end of file diff --git a/examples/reduce_perf.html b/examples/reduce_perf.html new file mode 100644 index 0000000..0c842e9 --- /dev/null +++ b/examples/reduce_perf.html @@ -0,0 +1,79 @@ + + + + + + + Standalone WebGPU Reduce Primitive Test, with Configuration Pane + + + + + +

+ This example is a self-contained use of the reduce + primitive, meant to plot performance. This builds on + the simpler + functionality example. Set your + parameters in the pane and click "Start" to run and plot performance data + for a WebGPU reduce. The inputCount input specifies + how many different input lengths to run, which will be evenly + (logarithmically) interpolated between the specified start and end + lengths. Otherwise, the parameters are the same as in the + functionality example. This + example explains + how to time a Gridwise primitive. + The entire JS source + file is in github. +

+

+ To measure CPU and/or GPU timing, include a timing directive in the call + to primitive.execute. Typically we call the primitive once + without any timing information to handle warmup effects (e.g., compiling + the kernel) and then call the primitive many times with timing enabled. + The reported time is the total over that second set of calls, so we divide + it by the number of trials to get the average runtime per call. +

+
/* call the primitive once to warm up */
+await primitive.execute({
+  inputBuffer: memsrcBuffer,
+  outputBuffer: memdestBuffer,
+});
+/* call params.trials times */
+await primitive.execute({
+  inputBuffer: memsrcBuffer,
+  outputBuffer: memdestBuffer,
+  trials: params.trials, /* integer */
+  enableGPUTiming: true,
+  enableCPUTiming: true,
+});
+

+ We can get timing information back from the primitive with a `getTimingResult` + call. The GPU time might be an array of timings if the GPU call has + multiple kernels within it. In the below example, we simply flatten that + array by adding it up into a total time. +

+
let { gpuTotalTimeNS, cpuTotalTimeNS } = await primitive.getTimingResult();
+if (gpuTotalTimeNS instanceof Array) {
+  // gpuTotalTimeNS might be a list, in which case just sum it up
+  gpuTotalTimeNS = gpuTotalTimeNS.reduce((x, a) => x + a, 0);
+}
+averageGpuTotalTimeNS = gpuTotalTimeNS / params.trials;
+averageCpuTotalTimeNS = cpuTotalTimeNS / params.trials;
+

+ The reduce primitive computes a single output value from an + input array using a binary operation (such as add, max, or min). This makes + it simpler to time than sort (which overwrites its input) since the input + remains unchanged after each execution. +

+
+ + + +
+
+ + + \ No newline at end of file diff --git a/examples/reduce_perf.mjs b/examples/reduce_perf.mjs new file mode 100644 index 0000000..ce220a2 --- /dev/null +++ b/examples/reduce_perf.mjs @@ -0,0 +1,330 @@ +import { Pane } from "https://cdn.jsdelivr.net/npm/tweakpane@4.0.5/dist/tweakpane.min.js"; +import { BinOpAdd, BinOpMax, BinOpMin, makeBinOp } from "../binop.mjs"; +import { + datatypeToTypedArray, + logspaceRounded, + datatypeToBytes, +} from "../util.mjs"; +import { DLDFScan } from "../scandldf.mjs"; + +let Plot = await import( + "https://cdn.jsdelivr.net/npm/@observablehq/plot@0.6/+esm" +); + +/* set up a WebGPU device */ +const adapter = await navigator.gpu?.requestAdapter(); +const hasSubgroups = adapter.features.has("subgroups"); +const hasTimestampQuery = adapter.features.has("timestamp-query"); +const device = await adapter?.requestDevice({ + requiredFeatures: [ + ...(hasTimestampQuery ? ["timestamp-query"] : []), + ...(hasSubgroups ? ["subgroups"] : []), + ], +}); + +if (!device) { + console.error("Fatal error: Device does not support WebGPU."); +} + +/* set up the UI, with parameters stored in the "params" object */ +const pane = new Pane(); +const params = { + /* defaults */ + datatype: "u32", + binop: "add", + inputLengthStart: 2 ** 20, + inputLengthEnd: 2 ** 22, + inputCount: 3, + trials: 5, +}; + +pane.addBinding(params, "datatype", { + options: { + // what it shows : what it returns + u32: "u32", + i32: "i32", + f32: "f32", + }, +}); + +pane.addBinding(params, "binop", { + options: { + // what it shows : what it returns + add: "add", + max: "max", + min: "min", + }, +}); + +pane.addBinding(params, "inputLengthStart", { format: (v) => Math.floor(v) }); +pane.addBinding(params, "inputLengthEnd", { format: (v) => Math.floor(v) }); +pane.addBinding(params, "inputCount", { format: (v) => Math.floor(v) }); +pane.addBinding(params, "trials", { format: (v) => Math.floor(v) }); + +const button = pane.addButton({ + title: "Start", +}); + +button.on("click", 
async () => { + if (params.inputLengthStart % 4 !== 0) { + params.inputLengthStart = Math.floor(params.inputLengthStart / 4) * 4; + } + if (params.inputLengthEnd % 4 !== 0) { + params.inputLengthEnd = Math.floor(params.inputLengthEnd / 4) * 4; + } + params.inputCount = Math.floor(params.inputCount); + params.trials = Math.floor(params.trials); + + /* because inputLength may change here, we need to refresh the pane */ + pane.refresh(); + const results = document.getElementById("webgpu-results"); + const validation = await buildAndRun(); + results.innerHTML = `

I ran this

+ +

${validation}

`; +}); +/* end of setting up the UI */ + +/* all of the work is in this function */ +async function buildAndRun() { + let returnStr = ""; + const results = new Array(); // push new rows (experiments) onto this + + for (const inputLength of logspaceRounded( + params.inputLengthStart, + params.inputLengthEnd, + params.inputCount + )) { + /* generate an input dataset */ + const memsrc = new (datatypeToTypedArray(params.datatype))(inputLength); + + /* generate ~random input datasets that are friendly for reduce */ + for (let i = 0; i < inputLength; i++) { + switch (params.datatype) { + case "u32": + /* roughly, [0, 32], ints */ + memsrc[i] = Math.floor(Math.random() * Math.pow(2, 5)); + break; + case "f32": + case "i32": + /* roughly, [-1024, 1024], ints */ + memsrc[i] = + (Math.random() < 0.5 ? 1 : -1) * + Math.floor(Math.random() * Math.pow(2, 10)); + break; + } + } + console.log("input array", memsrc); + + /* declare the primitive */ + const primitive = new DLDFScan({ + device, + binop: makeBinOp({ op: params.binop, datatype: params.datatype }), + type: "reduce", + datatype: params.datatype, + }); + + /* size the output - reduce always outputs a single value (4 bytes) */ + const memdestBytes = 4; + + /* allocate/create buffers on the GPU to hold in/out data */ + const memsrcBuffer = device.createBuffer({ + label: `memory source buffer (${params.datatype})`, + size: memsrc.byteLength, + usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST, + }); + device.queue.writeBuffer(memsrcBuffer, 0, memsrc); + + const memdestBuffer = device.createBuffer({ + label: "memory destination buffer", + size: memdestBytes, + usage: + GPUBufferUsage.STORAGE | + GPUBufferUsage.COPY_SRC | + GPUBufferUsage.COPY_DST /* COPY_DST necessary for initialization */, + }); + + const mappableMemdestBuffer = device.createBuffer({ + label: "mappable memory destination buffer", + size: memdestBytes, + usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST, + }); + + /* actually run the primitive 
*/ + const primitiveOptions = { + trials: params.trials, + enableGPUTiming: hasTimestampQuery, + enableCPUTiming: true, + }; + + /* call once, ignore result (warmup) */ + await primitive.execute({ + inputBuffer: memsrcBuffer, + outputBuffer: memdestBuffer, + }); + + /* call trials times */ + await primitive.execute({ + inputBuffer: memsrcBuffer, + outputBuffer: memdestBuffer, + ...primitiveOptions, + }); + + /* Append CPU and GPU timing results to "results" array */ + let { gpuTotalTimeNS, cpuTotalTimeNS } = await primitive.getTimingResult(); + console.log(gpuTotalTimeNS, cpuTotalTimeNS); + const result = {}; + if (gpuTotalTimeNS instanceof Array) { + // gpuTotalTimeNS might be a list, in which case just sum it up + result.gpuTotalTimeNSArray = gpuTotalTimeNS; + gpuTotalTimeNS = gpuTotalTimeNS.reduce((x, a) => x + a, 0); + } + result.gputime = gpuTotalTimeNS / params.trials; + result.cputime = cpuTotalTimeNS / params.trials; + result.inputBytes = inputLength * datatypeToBytes(primitive.datatype); + result.bandwidthGPU = primitive.bytesTransferred / result.gputime; + result.bandwidthCPU = primitive.bytesTransferred / result.cputime; + result.inputItemsPerSecondE9GPU = inputLength / result.gputime; + result.inputItemsPerSecondE9CPU = inputLength / result.cputime; + + results.push({ + ...result, + timing: "GPU", + time: result.gputime, + bandwidth: result.bandwidthGPU, + inputItemsPerSecondE9: result.inputItemsPerSecondE9GPU, + }); + results.push({ + ...result, + timing: "CPU", + time: result.cputime, + bandwidth: result.bandwidthCPU, + inputItemsPerSecondE9: result.inputItemsPerSecondE9CPU, + }); + + /* copy output back to host */ + const encoder = device.createCommandEncoder({ + label: "copy result CPU->GPU encoder", + }); + encoder.copyBufferToBuffer( + memdestBuffer, + 0, + mappableMemdestBuffer, + 0, + mappableMemdestBuffer.size + ); + const commandBuffer = encoder.finish(); + device.queue.submit([commandBuffer]); + + await 
mappableMemdestBuffer.mapAsync(GPUMapMode.READ); + const memdest = new (datatypeToTypedArray(params.datatype))( + mappableMemdestBuffer.getMappedRange().slice() + ); + mappableMemdestBuffer.unmap(); + + console.log("output array", memdest); + + if (primitive.validate) { + const errorstr = primitive.validate({ + inputBuffer: memsrc, + outputBuffer: memdest, + }); + + if (errorstr === "") { + returnStr += `Validation passed (input length: ${inputLength})
`; + } else { + returnStr += `Validation failed (input length: ${inputLength})
${errorstr}
`; + } + } else { + returnStr += `Validation not performed (input length: ${inputLength})
`; + } + } /* end loop over input lengths */ + plotResults(results); + return returnStr; +} + +function plotResults(results) { + console.log(results); + const plots = [ + { + x: { field: "inputBytes", label: "Input array size (B)" }, + y: { field: "bandwidth", label: "Bandwidth (GB/s)" }, + stroke: { field: "timing" }, + caption: + "BANDWIDTH | CPU timing (performance.now), GPU timing (timestamps)", + }, + { + x: { field: "inputBytes", label: "Input array size (B)" }, + y: { field: "time", label: "Runtime (ns)" }, + stroke: { field: "timing" }, + caption: + "RUNTIME | CPU timing (performance.now), GPU timing (timestamps)", + }, + ]; + for (let plot of plots) { + const mark = plot.mark ?? "lineY"; + const schema = { + marks: [ + Plot[mark](results, { + x: plot.x.field, + y: plot.y.field, + ...("fx" in plot && { fx: plot.fx.field }), + ...("fy" in plot && { fy: plot.fy.field }), + ...("stroke" in plot && { + stroke: plot.stroke.field, + }), + tip: true, + }), + Plot.text( + results, + Plot.selectLast({ + x: plot.x.field, + y: plot.y.field, + ...("stroke" in plot && { + z: plot.stroke.field, + text: plot.stroke.field, + }), + ...("fx" in plot && { fx: plot.fx.field }), + ...("fy" in plot && { fy: plot.fy.field }), + textAnchor: "start", + clip: false, + dx: 3, + }) + ), + Plot.text([plot.text_tl ?? ""], { + lineWidth: 30, + dx: 5, + frameAnchor: "top-left", + }), + Plot.text(plot.text_br ?? "", { + lineWidth: 30, + dx: 5, + frameAnchor: "bottom-right", + }), + ], + x: { type: "log", label: plot?.x?.label ?? "XLABEL" }, + y: { type: "log", label: plot?.y?.label ?? 
"YLABEL" }, + ...("fx" in plot && { + fx: { label: plot.fx.label }, + }), + ...("fy" in plot && { + fy: { label: plot.fy.label }, + }), + ...(("fx" in plot || "fy" in plot) && { grid: true }), + color: { type: "ordinal", legend: true }, + width: 1280, + title: plot?.title, + subtitle: plot?.subtitle, + caption: plot?.caption, + }; + const plotted = Plot.plot(schema); + const div = document.querySelector("#plot"); + div.append(plotted); + div.append(document.createElement("hr")); + } +} \ No newline at end of file