From a252d9dd7309613bcfbcf88222df63ded32994b6 Mon Sep 17 00:00:00 2001 From: alan-baker Date: Fri, 29 Nov 2024 15:32:38 -0500 Subject: [PATCH] Rework subgroup broadcast tests (#4069) * Now tests subgroupBroadcast and subgroupBroadcastFirst * Reuses shared utility code * Better data type tests * Added fragment tests * Broadcast first specific tests * Predicated tests --- src/webgpu/listing_meta.json | 3 + .../call/builtin/subgroupBroadcast.spec.ts | 704 ++++++++++++++---- 2 files changed, 553 insertions(+), 154 deletions(-) diff --git a/src/webgpu/listing_meta.json b/src/webgpu/listing_meta.json index acce2b1e2465..6b3e8a361e3d 100644 --- a/src/webgpu/listing_meta.json +++ b/src/webgpu/listing_meta.json @@ -1568,6 +1568,9 @@ "webgpu:shader,execution,expression,call,builtin,subgroupBitwise:data_types:*": { "subcaseMS": 5081.792 }, "webgpu:shader,execution,expression,call,builtin,subgroupBitwise:fragment,all_active:*": { "subcaseMS": 9079.446 }, "webgpu:shader,execution,expression,call,builtin,subgroupBitwise:fragment,split:*": { "subcaseMS": 0.347 }, + "webgpu:shader,execution,expression,call,builtin,subgroupBroadcast:broadcastFirst,split:*": { "subcaseMS": 1974.247 }, + "webgpu:shader,execution,expression,call,builtin,subgroupBroadcast:compute,all_active:*": { "subcaseMS": 163.147 }, + "webgpu:shader,execution,expression,call,builtin,subgroupBroadcast:compute,split:*": { "subcaseMS": 2659.973 }, "webgpu:shader,execution,expression,call,builtin,subgroupBroadcast:data_types:*": { "subcaseMS": 252.374 }, "webgpu:shader,execution,expression,call,builtin,subgroupBroadcast:fragment:*": { "subcaseMS": 0.108 }, "webgpu:shader,execution,expression,call,builtin,subgroupBroadcast:workgroup_uniform_load:*": { "subcaseMS": 109.832 }, diff --git a/src/webgpu/shader/execution/expression/call/builtin/subgroupBroadcast.spec.ts b/src/webgpu/shader/execution/expression/call/builtin/subgroupBroadcast.spec.ts index d62abb830328..5c9650778a68 100644 --- a/src/webgpu/shader/execution/expression/call/builtin/subgroupBroadcast.spec.ts +++ b/src/webgpu/shader/execution/expression/call/builtin/subgroupBroadcast.spec.ts @@ -1,5 +1,5 @@ export const description = ` -Execution tests for subgroupBroadcast +Execution tests for subgroupBroadcast and subgroupBroadcastFirst Note: There is a lack of portability for non-uniform execution so these tests restrict themselves to uniform control flow. @@ -10,61 +10,100 @@ local_invocation_index. Tests should avoid assuming there is. import { makeTestGroup } from '../../../../../../common/framework/test_group.js'; import { keysOf, objectsToRecord } from '../../../../../../common/util/data_tables.js'; import { iterRange } from '../../../../../../common/util/util.js'; -import { GPUTest } from '../../../../../gpu_test.js'; import { kConcreteNumericScalarsAndVectors, Type, VectorType, - elementTypeOf, } from '../../../../../util/conversion.js'; -export const g = makeTestGroup(GPUTest); +import { + kWGSizes, + runComputeTest, + kDataSentinel, + generateTypedInputs, + kPredicateCases, + runFragmentTest, + getUintsPerFramebuffer, + SubgroupTest, + kFramebufferSizes, +} from './subgroup_util.js'; + +export const g = makeTestGroup(SubgroupTest); const kDataTypes = objectsToRecord(kConcreteNumericScalarsAndVectors); -const kWGSizes = [ - [4, 1, 1], - [8, 1, 1], - [16, 1, 1], - [32, 1, 1], - [64, 1, 1], - [128, 1, 1], - [256, 1, 1], - [1, 4, 1], - [1, 8, 1], - [1, 16, 1], - [1, 32, 1], - [1, 64, 1], - [1, 128, 1], - [1, 256, 1], - [1, 1, 4], - [1, 1, 8], - [1, 1, 16], - [1, 1, 32], - [1, 1, 64], - [3, 3, 3], - [4, 4, 4], - [16, 16, 1], - [16, 1, 16], - [1, 16, 16], - [15, 3, 3], - [3, 15, 3], - [3, 3, 15], -] as const; +/** + * Checks the results of the data types test + * + * The outputs for a given index are expected to match the input values + * for the given broadcast. + * @param metadata An unused parameter + * @param output The output data + * @param id The broadcast id + * @param type The data type + */ +function checkDataTypes( + metadata: Uint32Array, + output: Uint32Array, + input: Uint32Array, + id: number, + type: Type +): Error | undefined { + if (type.requiresF16() && !(type instanceof VectorType)) { + for (let i = 0; i < 4; i++) { + const expectIdx = Math.floor(id / 2); + const expectShift = id % 2 === 1; + let expect = input[expectIdx]; + if (expectShift) { + expect >>= 16; + } + expect &= 0xffff; + + const resIdx = Math.floor(i / 2); + const resShift = i % 2 === 1; + let res = output[resIdx]; + if (resShift) { + res >>= 16; + } + res &= 0xffff; + + if (res !== expect) { + return new Error(`${i}: incorrect result +- expected: ${expect} +- got: ${res}`); + } + } + } else { + let uints = 1; + if (type instanceof VectorType) { + uints = type.width === 3 ? 4 : type.width; + if (type.requiresF16()) { + uints = Math.floor(uints / 2); + } + } + for (let i = 0; i < 4; i++) { + for (let j = 0; j < uints; j++) { + const expect = input[id * uints + j]; + const res = output[i * uints + j]; + if (res !== expect) { + return new Error(`${uints * i + j}: incorrect result +- expected: ${expect} +- got: ${res}`); + } + } + } + } + + return undefined; +} g.test('data_types') .desc('Tests broadcast of data types') .params(u => u .combine('type', keysOf(kDataTypes)) - .filter(t => { - // Skip vec3h for simplicity - const type = kDataTypes[t.type]; - return type !== Type['vec3h']; - }) .beginSubcases() .combine('id', [0, 1, 2, 3] as const) - .combine('wgSize', kWGSizes) ) .beforeAllSubcases(t => { const features: GPUFeatureName[] = ['subgroups' as GPUFeatureName]; @@ -76,139 +115,58 @@ g.test('data_types') t.selectDeviceOrSkipTestCase(features); }) .fn(async t => { + const wgSize = [4, 1, 1]; const type = kDataTypes[t.params.type]; let enables = 'enable subgroups;\n'; if (type.requiresF16()) { enables += 'enable f16;\nenable subgroups_f16;\n'; } - // Compatibility mode has lower workgroup limits. - const wgThreads = t.params.wgSize[0] * t.params.wgSize[1] * t.params.wgSize[2]; - const { - maxComputeInvocationsPerWorkgroup, - maxComputeWorkgroupSizeX, - maxComputeWorkgroupSizeY, - maxComputeWorkgroupSizeZ, - } = t.device.limits; - t.skipIf( - maxComputeInvocationsPerWorkgroup < wgThreads || - maxComputeWorkgroupSizeX < t.params.wgSize[0] || - maxComputeWorkgroupSizeY < t.params.wgSize[1] || - maxComputeWorkgroupSizeZ < t.params.wgSize[2], - 'Workgroup size too large' - ); - - let outputType: Type = Type.u32; - if (type instanceof VectorType) { - outputType = Type['vec'](type.width, outputType); - } - - let updates = `_ = atomicAdd(&output[b], 1);`; - if (type instanceof VectorType) { - updates = ``; - for (let i = 0; i < type.width; i++) { - updates += `_ = atomicAdd(&output[b[${i}]], 1);\n`; - } - } + const broadcast = + t.params.id === 0 + ? `subgroupBroadcastFirst(input[id])` + : `subgroupBroadcast(input[id], ${t.params.id})`; - // This test should be expanded to cover large input values for each type instead of just conversions to u32. const wgsl = ` ${enables} @group(0) @binding(0) -var size : u32; +var input : array<${type.toString()}>; @group(0) @binding(1) -var output : array>; +var output : array<${type.toString()}>; -@compute @workgroup_size(${t.params.wgSize[0]}, ${t.params.wgSize[1]}, ${t.params.wgSize[2]}) -fn main(@builtin(local_invocation_index) lid : u32, - @builtin(subgroup_invocation_id) id : u32, - @builtin(subgroup_size) subgroupSize : u32) { - let scalar = ${elementTypeOf(type).toString()}(id); - let v = ${type.toString()}(scalar); - let b = ${outputType.toString()}(subgroupBroadcast(v, ${t.params.id})); - ${updates} - if (lid == 0) { - size = subgroupSize; - } -}`; - - const sizeBuffer = t.makeBufferWithContents( - new Uint32Array([0]), - GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE - ); - t.trackForCleanup(sizeBuffer); +@group(0) @binding(2) +var metadata : array; // unused - const outputNumInts = wgThreads; - const outputBuffer = t.makeBufferWithContents( - new Uint32Array([...iterRange(outputNumInts, x => 0)]), - GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE - ); - t.trackForCleanup(outputBuffer); +@compute @workgroup_size(${wgSize[0]}, ${wgSize[1]}, ${wgSize[2]}) +fn main( + @builtin(subgroup_invocation_id) id : u32, +) { + // Force usage. + _ = metadata[0]; - const pipeline = t.device.createComputePipeline({ - layout: 'auto', - compute: { - module: t.device.createShaderModule({ - code: wgsl, - }), - entryPoint: 'main', - }, - }); - const bg = t.device.createBindGroup({ - layout: pipeline.getBindGroupLayout(0), - entries: [ - { - binding: 0, - resource: { - buffer: sizeBuffer, - }, - }, - { - binding: 1, - resource: { - buffer: outputBuffer, - }, - }, - ], - }); - - const encoder = t.device.createCommandEncoder(); - const pass = encoder.beginComputePass(); - pass.setPipeline(pipeline); - pass.setBindGroup(0, bg); - pass.dispatchWorkgroups(1, 1, 1); - pass.end(); - t.queue.submit([encoder.finish()]); - - const sizeReadback = await t.readGPUBufferRangeTyped(sizeBuffer, { - srcByteOffset: 0, - type: Uint32Array, - typedLength: 1, - method: 'copy', - }); - const subgroupSize = sizeReadback.data[0]; + output[id] = ${broadcast}; +}`; - let width = 1; + const inputData = generateTypedInputs(type); + let uintsPerOutput = 1; if (type instanceof VectorType) { - width = type.width; - } - - const expect = Array(wgThreads); - expect.fill(0); - - const numFullSubgroups = Math.floor(wgThreads / subgroupSize); - const partialSize = wgThreads % subgroupSize; - const numSubgroups = Math.ceil(wgThreads / subgroupSize); - for (let i = 0; i < numSubgroups; i++) { - if (i < numFullSubgroups) { - expect[t.params.id] += width * subgroupSize; - } else { - expect[t.params.id] += width * partialSize; + uintsPerOutput = type.width === 3 ? 4 : type.width; + if (type.requiresF16()) { + uintsPerOutput = Math.floor(uintsPerOutput / 2); } } - t.expectGPUBufferValuesEqual(outputBuffer, new Uint32Array(expect)); + await runComputeTest( + t, + wgsl, + wgSize, + uintsPerOutput, + inputData, + (metadata: Uint32Array, output: Uint32Array) => { + return checkDataTypes(metadata, output, inputData, t.params.id, type); + } + ); }); g.test('workgroup_uniform_load') @@ -218,6 +176,7 @@ g.test('workgroup_uniform_load') .combine('wgSize', kWGSizes) .beginSubcases() .combine('inputId', [1, 2, 3] as const) + .combine('first', [false, true] as const) ) .beforeAllSubcases(t => { t.selectDeviceOrSkipTestCase('subgroups' as GPUFeatureName); @@ -239,6 +198,8 @@ g.test('workgroup_uniform_load') 'Workgroup size too large' ); + const broadcast = t.params.first ? `subgroupBroadcastFirst(v)` : `subgroupBroadcast(v, 0)`; + const wgsl = ` enable subgroups; @@ -263,7 +224,7 @@ fn main(@builtin(subgroup_invocation_id) id : u32, if (id == 0) { v = wgmem; } - v = subgroupBroadcast(v, 0); + v = ${broadcast}; output[lid] = v; }`; @@ -320,4 +281,439 @@ fn main(@builtin(subgroup_invocation_id) id : u32, t.expectGPUBufferValuesEqual(outputBuffer, new Uint32Array(expect)); }); -g.test('fragment').unimplemented(); +/** + * Checks the results of broadcast in compute shaders. + * + * @param metadata An array of uint32s + * * first half is subgroup_invocation_id + * * second half is subgroup_size + * @param output An array uint32s containing the broadcast results + * @param numInvs The number of invocations + * @param broadcast The broadcast invocation (or 'first' to indicate the lowest active) + * @param filter A functor indicating whether the invocation participates in the broadcast + */ +function checkCompute( + metadata: Uint32Array, + output: Uint32Array, + numInvs: number, + broadcast: number | 'first', + filter: (id: number, size: number) => boolean +): Error | undefined { + let broadcastedId = broadcast; + if (broadcast === 'first') { + // Subgroup size is uniform in compute shaders so any will do. + const size = metadata[numInvs]; + for (let i = 0; i < size; i++) { + if (filter(i, size)) { + broadcastedId = i; + break; + } + } + } + + const mapping = new Map(); + const sizes = new Map(); + for (let i = 0; i < numInvs; i++) { + const id = metadata[i]; + const size = metadata[i + numInvs]; + + const res = output[i]; + + if (filter(id, size)) { + let seen = mapping.get(res) ?? 0; + seen++; + mapping.set(res, seen); + + if (broadcastedId === id) { + sizes.set(res, size); + if (res !== i) { + return new Error(`Invocation ${i}: incorrect result: +- expected: ${i} +- got: ${res}`); + } + } + } else { + if (res !== kDataSentinel) { + return new Error(`Invocation ${i}: unexpected write (${res})`); + } + } + } + + for (const [key, value] of mapping) { + const id = Number(key); + const seen = Number(value); + const size = sizes.get(id) ?? 0; + if (size < seen) { + return new Error(`Unexpected number of invocations for subgroup ${id} +- expected: ${size} +- got: ${seen}`); + } + } + + return undefined; +} + +g.test('compute,all_active') + .desc('Test broadcasts in compute shaders with all active invocations') + .params(u => + u + .combine('wgSize', kWGSizes) + .beginSubcases() + // Only values < 4 are used because it is a dynamic error to broadcast an inactive invocation. + .combine('id', [0, 1, 2, 3] as const) + ) + .beforeAllSubcases(t => { + t.selectDeviceOrSkipTestCase('subgroups' as GPUFeatureName); + }) + .fn(async t => { + const wgThreads = t.params.wgSize[0] * t.params.wgSize[1] * t.params.wgSize[2]; + + const broadcast = + t.params.id === 0 + ? `subgroupBroadcastFirst(input[lid])` + : `subgroupBroadcast(input[lid], ${t.params.id})`; + + const wgsl = ` +enable subgroups; + +@group(0) @binding(0) +var input : array; + +struct Metadata { + id : array, + size : array, +} + +@group(0) @binding(1) +var output : array; + +@group(0) @binding(2) +var metadata: Metadata; + +@compute @workgroup_size(${t.params.wgSize[0]}, ${t.params.wgSize[1]}, ${t.params.wgSize[2]}) +fn main( + @builtin(local_invocation_index) lid : u32, + @builtin(subgroup_invocation_id) id : u32, + @builtin(subgroup_size) subgroupSize : u32, +) { + metadata.id[lid] = id; + metadata.size[lid] = subgroupSize; + + output[lid] = ${broadcast}; +}`; + + const inputData = new Uint32Array([...iterRange(wgThreads, x => x)]); + const uintsPerOutput = 1; + await runComputeTest( + t, + wgsl, + [t.params.wgSize[0], t.params.wgSize[1], t.params.wgSize[2]], + uintsPerOutput, + inputData, + (metadata: Uint32Array, output: Uint32Array) => { + return checkCompute( + metadata, + output, + wgThreads, + t.params.id, + (id: number, size: number) => { + return true; + } + ); + } + ); + }); + +g.test('compute,split') + .desc(`Test broadcasts with only some active invocations`) + .params(u => + u + .combine('predicate', keysOf(kPredicateCases)) + .filter(t => { + // This case would be UB + return t.predicate !== 'upper_half'; + }) + .beginSubcases() + .combine('id', [0, 1, 2, 3] as const) + .combine('wgSize', kWGSizes) + ) + .beforeAllSubcases(t => { + t.selectDeviceOrSkipTestCase('subgroups' as GPUFeatureName); + }) + .fn(async t => { + const testcase = kPredicateCases[t.params.predicate]; + const wgThreads = t.params.wgSize[0] * t.params.wgSize[1] * t.params.wgSize[2]; + + interface SubgroupLimits extends GPUSupportedLimits { + minSubgroupSize: number; + maxSubgroupSize: number; + } + const { minSubgroupSize, maxSubgroupSize } = t.device.limits as SubgroupLimits; + for (let size = minSubgroupSize; size <= maxSubgroupSize; size *= 2) { + t.skipIf(!testcase.filter(t.params.id, size), 'Skipping potential undefined behavior'); + } + + const broadcast = + t.params.id === 0 + ? `subgroupBroadcastFirst(input[lid])` + : `subgroupBroadcast(input[lid], ${t.params.id})`; + + const wgsl = ` +enable subgroups; +diagnostic(off, subgroup_uniformity); + +@group(0) @binding(0) +var input : array; + +struct Metadata { + id : array, + size : array, +} + +@group(0) @binding(1) +var output : array; + +@group(0) @binding(2) +var metadata: Metadata; + +@compute @workgroup_size(${t.params.wgSize[0]}, ${t.params.wgSize[1]}, ${t.params.wgSize[2]}) +fn main( + @builtin(local_invocation_index) lid : u32, + @builtin(subgroup_invocation_id) id : u32, + @builtin(subgroup_size) subgroupSize : u32, +) { + metadata.id[lid] = id; + metadata.size[lid] = subgroupSize; + + if ${testcase.cond} { + output[lid] = ${broadcast}; + } else { + return; + } +}`; + + const inputData = new Uint32Array([...iterRange(wgThreads, x => x)]); + const uintsPerOutput = 1; + await runComputeTest( + t, + wgsl, + [t.params.wgSize[0], t.params.wgSize[1], t.params.wgSize[2]], + uintsPerOutput, + inputData, + (metadata: Uint32Array, output: Uint32Array) => { + return checkCompute(metadata, output, wgThreads, t.params.id, testcase.filter); + } + ); + }); + +g.test('broadcastFirst,split') + .desc(`Test broadcastFirst with only some active invocations`) + .params(u => + u.combine('predicate', keysOf(kPredicateCases)).beginSubcases().combine('wgSize', kWGSizes) + ) + .beforeAllSubcases(t => { + t.selectDeviceOrSkipTestCase('subgroups' as GPUFeatureName); + }) + .fn(async t => { + const testcase = kPredicateCases[t.params.predicate]; + const wgThreads = t.params.wgSize[0] * t.params.wgSize[1] * t.params.wgSize[2]; + + const wgsl = ` +enable subgroups; +diagnostic(off, subgroup_uniformity); + +@group(0) @binding(0) +var input : array; + +struct Metadata { + id : array, + size : array, +} + +@group(0) @binding(1) +var output : array; + +@group(0) @binding(2) +var metadata: Metadata; + +@compute @workgroup_size(${t.params.wgSize[0]}, ${t.params.wgSize[1]}, ${t.params.wgSize[2]}) +fn main( + @builtin(local_invocation_index) lid : u32, + @builtin(subgroup_invocation_id) id : u32, + @builtin(subgroup_size) subgroupSize : u32, +) { + metadata.id[lid] = id; + metadata.size[lid] = subgroupSize; + + if ${testcase.cond} { + output[lid] = subgroupBroadcastFirst(input[lid]); + } else { + return; + } +}`; + + const inputData = new Uint32Array([...iterRange(wgThreads, x => x)]); + const uintsPerOutput = 1; + await runComputeTest( + t, + wgsl, + [t.params.wgSize[0], t.params.wgSize[1], t.params.wgSize[2]], + uintsPerOutput, + inputData, + (metadata: Uint32Array, output: Uint32Array) => { + return checkCompute(metadata, output, wgThreads, 'first', testcase.filter); + } + ); + }); + +/** + * Check broadcasts in fragment shaders + * + * Only checks subgroups where no invocation is in the last row + * or column to avoid helper invocations. + * @param data The framebuffer output + * * component 0 is the broadcast result + * * component 1 is the subgroup_invocation_id + * * component 2 is the subgroup_size + * @param format The framebuffer format + * @param width The framebuffer width + * @param height The framebuffer height + * @param broadcast The id being broadcast + */ +function checkFragment( + data: Uint32Array, + format: GPUTextureFormat, + width: number, + height: number, + broadcast: number +): Error | undefined { + const { uintsPerRow, uintsPerTexel } = getUintsPerFramebuffer(format, width, height); + + const coordToIndex = (row: number, col: number) => { + return uintsPerRow * row + col * uintsPerTexel; + }; + + const inBounds = new Map(); + for (let row = 0; row < height; row++) { + for (let col = 0; col < width; col++) { + const offset = coordToIndex(row, col); + + const res = data[offset]; + + let bound = inBounds.get(res) ?? true; + bound = bound && row < height - 1 && col < height - 1; + inBounds.set(res, bound); + } + } + + const seen = new Map(); + const sizes = new Map(); + for (let row = 0; row < height; row++) { + for (let col = 0; col < width; col++) { + const offset = coordToIndex(row, col); + + const res = data[offset]; + const bound = inBounds.get(res) ?? true; + if (!bound) { + continue; + } + + const id = data[offset + 1]; + const size = data[offset + 2]; + + let s = seen.get(res) ?? 0; + s++; + seen.set(res, s); + + if (id === broadcast) { + const linear = row * width + col; + if (res !== linear) { + return new Error(`Row ${row}, col ${col}: incorrect broadcast +- expected: ${linear} +- got: ${res}`); + } + + sizes.set(res, size); + } + } + } + + for (const [key, value] of inBounds) { + const id = Number(key); + const ok = Boolean(value); + if (ok) { + const size = sizes.get(id) ?? 0; + const seen = sizes.get(id) ?? 0; + if (size < seen) { + return new Error(`Unexpected number of invocations for subgroup ${id} +- expected: ${size} +- got: ${seen}`); + } + } + } + + return undefined; +} + +g.test('fragment') + .desc('Test broadcast in fragment shaders') + .params(u => + u + .combine('size', kFramebufferSizes) + .beginSubcases() + .combine('id', [0, 1, 2, 3] as const) + .combineWithParams([{ format: 'rgba32uint' }] as const) + ) + .beforeAllSubcases(t => { + t.selectDeviceOrSkipTestCase('subgroups' as GPUFeatureName); + }) + .fn(async t => { + const innerTexels = (t.params.size[0] - 1) * (t.params.size[1] - 1); + interface SubgroupLimits extends GPUSupportedLimits { + maxSubgroupSize: number; + } + const { maxSubgroupSize } = t.device.limits as SubgroupLimits; + t.skipIf(innerTexels < maxSubgroupSize, 'Too few texels to be reliable'); + + const broadcast = + t.params.id === 0 + ? `subgroupBroadcastFirst(input[linear])` + : `subgroupBroadcast(input[linear], ${t.params.id})`; + + const fsShader = ` +enable subgroups; + +@group(0) @binding(0) +var input : array; + +@fragment +fn main( + @builtin(position) pos : vec4f, + @builtin(subgroup_invocation_id) id : u32, + @builtin(subgroup_size) size : u32, +) -> @location(0) vec4u { + let linear = u32(pos.x) + u32(pos.y) * ${t.params.size[0]}; + + return vec4u(${broadcast}, id, size, linear); +}`; + + const texels = t.params.size[0] * t.params.size[1]; + const inputData = new Uint32Array([...iterRange(texels, x => x)]); + await runFragmentTest( + t, + t.params.format, + fsShader, + t.params.size[0], + t.params.size[1], + inputData, + (data: Uint32Array) => { + return checkFragment( + data, + t.params.format, + t.params.size[0], + t.params.size[1], + t.params.id + ); + } + ); + });