From 9b30f7a02d172ce36d138c02614d0a5a1edbfa72 Mon Sep 17 00:00:00 2001 From: alan-baker Date: Fri, 30 Aug 2024 15:00:10 -0400 Subject: [PATCH] Execution tests for subgroupAny and subgroupAll (#3924) * Compute tests with all active invocation and partially active invocations * Removed unimplemented dynamically uniform subgroupBroadcast test (due to const requirement) --- src/webgpu/listing_meta.json | 7 +- .../call/builtin/subgroupAll.spec.ts | 265 ++++++++++++++++++ .../call/builtin/subgroupAny.spec.ts | 265 ++++++++++++++++++ .../call/builtin/subgroupBroadcast.spec.ts | 2 - .../expression/call/builtin/subgroup_util.ts | 10 +- 5 files changed, 543 insertions(+), 6 deletions(-) create mode 100644 src/webgpu/shader/execution/expression/call/builtin/subgroupAll.spec.ts create mode 100644 src/webgpu/shader/execution/expression/call/builtin/subgroupAny.spec.ts diff --git a/src/webgpu/listing_meta.json b/src/webgpu/listing_meta.json index 0290bae1808a..9e73f590e65c 100644 --- a/src/webgpu/listing_meta.json +++ b/src/webgpu/listing_meta.json @@ -1529,13 +1529,18 @@ "webgpu:shader,execution,expression,call,builtin,subgroupAdd:data_types:*": { "subcaseMS": 9216.247 }, "webgpu:shader,execution,expression,call,builtin,subgroupAdd:fp_accuracy:*": { "subcaseMS": 9952.350 }, "webgpu:shader,execution,expression,call,builtin,subgroupAdd:fragment:*": { "subcaseMS": 0.229 }, + "webgpu:shader,execution,expression,call,builtin,subgroupAll:compute,all_active:*": { "subcaseMS": 5162.414 }, + "webgpu:shader,execution,expression,call,builtin,subgroupAll:compute,split:*": { "subcaseMS": 26610.627 }, + "webgpu:shader,execution,expression,call,builtin,subgroupAll:fragment:*": { "subcaseMS": 0.172 }, + "webgpu:shader,execution,expression,call,builtin,subgroupAny:compute,all_active:*": { "subcaseMS": 7028.394 }, + "webgpu:shader,execution,expression,call,builtin,subgroupAny:compute,split:*": { "subcaseMS": 50.998 }, + "webgpu:shader,execution,expression,call,builtin,subgroupAny:fragment:*": { "subcaseMS": 0.227 }, "webgpu:shader,execution,expression,call,builtin,subgroupBallot:compute,split:*": { "subcaseMS": 38.740 }, "webgpu:shader,execution,expression,call,builtin,subgroupBallot:fragment,split:*": { "subcaseMS": 0.331 }, "webgpu:shader,execution,expression,call,builtin,subgroupBallot:fragment:*": { "subcaseMS": 0.059 }, "webgpu:shader,execution,expression,call,builtin,subgroupBallot:predicate:*": { "subcaseMS": 0.075 }, "webgpu:shader,execution,expression,call,builtin,subgroupBallot:predicate_and_control_flow:*": { "subcaseMS": 41.053 }, "webgpu:shader,execution,expression,call,builtin,subgroupBroadcast:data_types:*": { "subcaseMS": 252.374 }, - "webgpu:shader,execution,expression,call,builtin,subgroupBroadcast:dynamically_uniform_id:*": { "subcaseMS": 0.211 }, "webgpu:shader,execution,expression,call,builtin,subgroupBroadcast:fragment:*": { "subcaseMS": 0.108 }, "webgpu:shader,execution,expression,call,builtin,subgroupBroadcast:workgroup_uniform_load:*": { "subcaseMS": 109.832 }, "webgpu:shader,execution,expression,call,builtin,subgroupMul:compute,split:*": { "subcaseMS": 5034.263 }, diff --git a/src/webgpu/shader/execution/expression/call/builtin/subgroupAll.spec.ts b/src/webgpu/shader/execution/expression/call/builtin/subgroupAll.spec.ts new file mode 100644 index 000000000000..831c33d8ed1c --- /dev/null +++ b/src/webgpu/shader/execution/expression/call/builtin/subgroupAll.spec.ts @@ -0,0 +1,265 @@ +export const description = ` +Execution tests for subgroupAll. + +Note: There is a lack of portability for non-uniform execution so these tests +restrict themselves to uniform control flow. +Note: There is no guaranteed mapping between subgroup_invocation_id and +local_invocation_index. Tests should avoid assuming there is. +`; + +import { makeTestGroup } from '../../../../../../common/framework/test_group.js'; +import { keysOf } from '../../../../../../common/util/data_tables.js'; +import { iterRange } from '../../../../../../common/util/util.js'; +import { PRNG } from '../../../../../util/prng.js'; + +import { + kWGSizes, + kPredicateCases, + SubgroupTest, + runComputeTest, + kDataSentinel, +} from './subgroup_util.js'; + +export const g = makeTestGroup(SubgroupTest); + +const kNumCases = 15; + +/** + * Generate input data for testing. + * + * Data is generated in the following categories: + * Seed 0 generates all 0 data + * Seed 1 generates all 1 data + * Seeds 2-9 generates all 1s except for a zero randomly once per 32 elements + * Seeds 10+ generate all random data + * @param seed The seed for the PRNG + * @param num The number of data items to generate + * @param addCounter If true, treats the first index as an atomic counter + */ +function generateInputData(seed: number, num: number, addCounter: boolean): Uint32Array { + const prng = new PRNG(seed); + + const bound = Math.min(num, 32); + const index = prng.uniformInt(bound); + + return new Uint32Array([ + ...iterRange(num, x => { + if (addCounter && x === 0) { + // Counter should start at 1 to avoid clear value. + return 1; + } + + if (seed === 0) { + return 0; + } else if (seed === 1) { + return 1; + } else if (seed < 10) { + const bounded = (addCounter ? x + 1 : x) % bound; + return bounded === index ? 0 : 1; + } + return prng.uniformInt(2); + }), + ]); +} + +/** + * Checks the result of a subgroupAll operation + * + * Since subgroup size depends on the pipeline compile, we calculate the expected + * results after execution. The shader generates a subgroup id and records it for + * each invocation. The check first calculates the expected result for each subgroup + * and then compares to the actual result for each invocation. The filter functor + * ensures only the correct invocations contribute to the calculation. + * @param metadata An array of uints: + * * first half containing subgroup sizes (from builtin value) + * * second half subgroup invocation id + * @param output An array of uints containing: + * * first half is the outputs of subgroupAll + * * second half is a generated subgroup id + * @param numInvs Number of invocations executed + * @param input The input data (equal size to output) + * @param filter A functor to filter active invocations + */ +function checkAll( + metadata: Uint32Array, // unused + output: Uint32Array, + numInvs: number, + input: Uint32Array, + filter: (id: number, size: number) => boolean +): Error | undefined { + // First, generate expected results. + const expected = new Map(); + for (let inv = 0; inv < numInvs; inv++) { + const size = metadata[inv]; + const id = metadata[inv + numInvs]; + if (!filter(id, size)) { + continue; + } + const subgroup_id = output[numInvs + inv]; + let v = expected.get(subgroup_id) ?? 1; + v &= input[inv]; + expected.set(subgroup_id, v); + } + + // Second, check against actual results. + for (let inv = 0; inv < numInvs; inv++) { + const size = metadata[inv]; + const id = metadata[inv + numInvs]; + const res = output[inv]; + if (filter(id, size)) { + const subgroup_id = output[numInvs + inv]; + const expected_v = expected.get(subgroup_id) ?? 0; + if (expected_v !== res) { + return new Error(`Invocation ${inv}: +- expected: ${expected_v} +- got: ${res}`); + } + } else { + if (res !== kDataSentinel) { + return new Error(`Invocation ${inv} unexpected write: +- subgroup invocation id: ${id} +- subgroup size: ${size}`); + } + } + } + + return undefined; +} + +g.test('compute,all_active') + .desc(`Test compute subgroupAll`) + .params(u => + u + .combine('wgSize', kWGSizes) + .beginSubcases() + .combine('case', [...iterRange(kNumCases, x => x)]) + ) + .beforeAllSubcases(t => { + t.selectDeviceOrSkipTestCase('subgroups' as GPUFeatureName); + }) + .fn(async t => { + const wgThreads = t.params.wgSize[0] * t.params.wgSize[1] * t.params.wgSize[2]; + + const wgsl = ` +enable subgroups; + +@group(0) @binding(0) +var inputs : array; + +@group(0) @binding(1) +var outputs : array; + +struct Metadata { + subgroup_size: array, + subgroup_invocation_id: array, +} + +@group(0) @binding(2) +var metadata : Metadata; + +@compute @workgroup_size(${t.params.wgSize[0]}, ${t.params.wgSize[1]}, ${t.params.wgSize[2]}) +fn main( + @builtin(local_invocation_index) lid : u32, + @builtin(subgroup_invocation_id) id : u32, + @builtin(subgroup_size) subgroupSize : u32, +) { + metadata.subgroup_size[lid] = subgroupSize; + + metadata.subgroup_invocation_id[lid] = id; + + // Record a representative subgroup id. + outputs[lid + ${wgThreads}] = subgroupBroadcastFirst(lid); + + let res = select(0u, 1u, subgroupAll(bool(inputs[lid]))); + outputs[lid] = res; +}`; + + const includeCounter = false; + const inputData = generateInputData(t.params.case, wgThreads, includeCounter); + + const uintsPerOutput = 2; + await runComputeTest( + t, + wgsl, + [t.params.wgSize[0], t.params.wgSize[1], t.params.wgSize[2]], + uintsPerOutput, + inputData, + (metadata: Uint32Array, output: Uint32Array) => { + return checkAll(metadata, output, wgThreads, inputData, (id: number, size: number) => { + return true; + }); + } + ); + }); + +g.test('compute,split') + .desc('Test that only active invocation participate') + .params(u => + u + .combine('predicate', keysOf(kPredicateCases)) + .beginSubcases() + .combine('wgSize', kWGSizes) + .combine('case', [...iterRange(kNumCases, x => x)]) + ) + .beforeAllSubcases(t => { + t.selectDeviceOrSkipTestCase('subgroups' as GPUFeatureName); + }) + .fn(async t => { + const testcase = kPredicateCases[t.params.predicate]; + const wgThreads = t.params.wgSize[0] * t.params.wgSize[1] * t.params.wgSize[2]; + + const wgsl = ` +enable subgroups; + +@group(0) @binding(0) +var inputs : array; + +@group(0) @binding(1) +var outputs : array; + +struct Metadata { + subgroup_size : array, + subgroup_invocation_id : array, +} + +@group(0) @binding(2) +var metadata : Metadata; + +@compute @workgroup_size(${t.params.wgSize[0]}, ${t.params.wgSize[1]}, ${t.params.wgSize[2]}) +fn main( + @builtin(local_invocation_index) lid : u32, + @builtin(subgroup_invocation_id) id : u32, + @builtin(subgroup_size) subgroupSize : u32, +) { + metadata.subgroup_size[lid] = subgroupSize; + + // Record subgroup invocation id for this invocation. + metadata.subgroup_invocation_id[lid] = id; + + // Record a generated subgroup id. + outputs[${wgThreads} + lid] = subgroupBroadcastFirst(lid); + + if ${testcase.cond} { + outputs[lid] = select(0u, 1u, subgroupAll(bool(inputs[lid]))); + } else { + return; + } +}`; + + const includeCounter = false; + const inputData = generateInputData(t.params.case, wgThreads, includeCounter); + + const uintsPerOutput = 2; + await runComputeTest( + t, + wgsl, + [t.params.wgSize[0], t.params.wgSize[1], t.params.wgSize[2]], + uintsPerOutput, + inputData, + (metadata: Uint32Array, output: Uint32Array) => { + return checkAll(metadata, output, wgThreads, inputData, testcase.filter); + } + ); + }); + +g.test('fragment').unimplemented(); diff --git a/src/webgpu/shader/execution/expression/call/builtin/subgroupAny.spec.ts b/src/webgpu/shader/execution/expression/call/builtin/subgroupAny.spec.ts new file mode 100644 index 000000000000..6418eb141dc5 --- /dev/null +++ b/src/webgpu/shader/execution/expression/call/builtin/subgroupAny.spec.ts @@ -0,0 +1,265 @@ +export const description = ` +Execution tests for subgroupAny. + +Note: There is a lack of portability for non-uniform execution so these tests +restrict themselves to uniform control flow. +Note: There is no guaranteed mapping between subgroup_invocation_id and +local_invocation_index. Tests should avoid assuming there is. +`; + +import { makeTestGroup } from '../../../../../../common/framework/test_group.js'; +import { keysOf } from '../../../../../../common/util/data_tables.js'; +import { iterRange } from '../../../../../../common/util/util.js'; +import { PRNG } from '../../../../../util/prng.js'; + +import { + kWGSizes, + kPredicateCases, + SubgroupTest, + runComputeTest, + kDataSentinel, +} from './subgroup_util.js'; + +export const g = makeTestGroup(SubgroupTest); + +const kNumCases = 15; + +/** + * Generate input data for testing. + * + * Data is generated in the following categories: + * Seed 0 generates all 0 data + * Seed 1 generates all 1 data + * Seeds 2-9 generates all 0s except for a one randomly once per 32 elements + * Seeds 10+ generate all random data + * @param seed The seed for the PRNG + * @param num The number of data items to generate + * @param addCounter If true, treats the first index as an atomic counter + */ +function generateInputData(seed: number, num: number, addCounter: boolean): Uint32Array { + const prng = new PRNG(seed); + + const bound = Math.min(num, 32); + const index = prng.uniformInt(bound); + + return new Uint32Array([ + ...iterRange(num, x => { + if (addCounter && x === 0) { + // Counter should start at 1 to avoid clear value. + return 1; + } + + if (seed === 0) { + return 0; + } else if (seed === 1) { + return 1; + } else if (seed < 10) { + const bounded = (addCounter ? x + 1 : x) % bound; + return bounded === index ? 1 : 0; + } + return prng.uniformInt(2); + }), + ]); +} + +/** + * Checks the result of a subgroupAny operation + * + * Since subgroup size depends on the pipeline compile, we calculate the expected + * results after execution. The shader generates a subgroup id and records it for + * each invocation. The check first calculates the expected result for each subgroup + * and then compares to the actual result for each invocation. The filter functor + * ensures only the correct invocations contribute to the calculation. + * @param metadata An array of uints: + * * first half containing subgroup sizes (from builtin value) + * * second half subgroup invocation id + * @param output An array of uints containing: + * * first half is the outputs of subgroupAny + * * second half is a generated subgroup id + * @param numInvs Number of invocations executed + * @param input The input data (equal size to output) + * @param filter A functor to filter active invocations + */ +function checkAny( + metadata: Uint32Array, // unused + output: Uint32Array, + numInvs: number, + input: Uint32Array, + filter: (id: number, size: number) => boolean +): Error | undefined { + // First, generate expected results. + const expected = new Map(); + for (let inv = 0; inv < numInvs; inv++) { + const size = metadata[inv]; + const id = metadata[inv + numInvs]; + if (!filter(id, size)) { + continue; + } + const subgroup_id = output[numInvs + inv]; + let v = expected.get(subgroup_id) ?? 0; + v |= input[inv]; + expected.set(subgroup_id, v); + } + + // Second, check against actual results. + for (let inv = 0; inv < numInvs; inv++) { + const size = metadata[inv]; + const id = metadata[inv + numInvs]; + const res = output[inv]; + if (filter(id, size)) { + const subgroup_id = output[numInvs + inv]; + const expected_v = expected.get(subgroup_id) ?? 0; + if (expected_v !== res) { + return new Error(`Invocation ${inv}: +- expected: ${expected_v} +- got: ${res}`); + } + } else { + if (res !== kDataSentinel) { + return new Error(`Invocation ${inv} unexpected write: +- subgroup invocation id: ${id} +- subgroup size: ${size}`); + } + } + } + + return undefined; +} + +g.test('compute,all_active') + .desc(`Test compute subgroupAny`) + .params(u => + u + .combine('wgSize', kWGSizes) + .beginSubcases() + .combine('case', [...iterRange(kNumCases, x => x)]) + ) + .beforeAllSubcases(t => { + t.selectDeviceOrSkipTestCase('subgroups' as GPUFeatureName); + }) + .fn(async t => { + const wgThreads = t.params.wgSize[0] * t.params.wgSize[1] * t.params.wgSize[2]; + + const wgsl = ` +enable subgroups; + +@group(0) @binding(0) +var inputs : array; + +@group(0) @binding(1) +var outputs : array; + +struct Metadata { + subgroup_size: array, + subgroup_invocation_id: array, +} + +@group(0) @binding(2) +var metadata : Metadata; + +@compute @workgroup_size(${t.params.wgSize[0]}, ${t.params.wgSize[1]}, ${t.params.wgSize[2]}) +fn main( + @builtin(local_invocation_index) lid : u32, + @builtin(subgroup_invocation_id) id : u32, + @builtin(subgroup_size) subgroupSize : u32, +) { + metadata.subgroup_size[lid] = subgroupSize; + + metadata.subgroup_invocation_id[lid] = id; + + // Record a representative subgroup id. + outputs[lid + ${wgThreads}] = subgroupBroadcastFirst(lid); + + let res = select(0u, 1u, subgroupAny(bool(inputs[lid]))); + outputs[lid] = res; +}`; + + const includeCounter = false; + const inputData = generateInputData(t.params.case, wgThreads, includeCounter); + + const uintsPerOutput = 2; + await runComputeTest( + t, + wgsl, + [t.params.wgSize[0], t.params.wgSize[1], t.params.wgSize[2]], + uintsPerOutput, + inputData, + (metadata: Uint32Array, output: Uint32Array) => { + return checkAny(metadata, output, wgThreads, inputData, (id: number, size: number) => { + return true; + }); + } + ); + }); + +g.test('compute,split') + .desc('Test that only active invocation participate') + .params(u => + u + .combine('predicate', keysOf(kPredicateCases)) + .beginSubcases() + .combine('wgSize', kWGSizes) + .combine('case', [...iterRange(kNumCases, x => x)]) + ) + .beforeAllSubcases(t => { + t.selectDeviceOrSkipTestCase('subgroups' as GPUFeatureName); + }) + .fn(async t => { + const testcase = kPredicateCases[t.params.predicate]; + const wgThreads = t.params.wgSize[0] * t.params.wgSize[1] * t.params.wgSize[2]; + + const wgsl = ` +enable subgroups; + +@group(0) @binding(0) +var inputs : array; + +@group(0) @binding(1) +var outputs : array; + +struct Metadata { + subgroup_size : array, + subgroup_invocation_id : array, +} + +@group(0) @binding(2) +var metadata : Metadata; + +@compute @workgroup_size(${t.params.wgSize[0]}, ${t.params.wgSize[1]}, ${t.params.wgSize[2]}) +fn main( + @builtin(local_invocation_index) lid : u32, + @builtin(subgroup_invocation_id) id : u32, + @builtin(subgroup_size) subgroupSize : u32, +) { + metadata.subgroup_size[lid] = subgroupSize; + + // Record subgroup invocation id for this invocation. + metadata.subgroup_invocation_id[lid] = id; + + // Record a generated subgroup id. + outputs[${wgThreads} + lid] = subgroupBroadcastFirst(lid); + + if ${testcase.cond} { + outputs[lid] = select(0u, 1u, subgroupAny(bool(inputs[lid]))); + } else { + return; + } +}`; + + const includeCounter = false; + const inputData = generateInputData(t.params.case, wgThreads, includeCounter); + + const uintsPerOutput = 2; + await runComputeTest( + t, + wgsl, + [t.params.wgSize[0], t.params.wgSize[1], t.params.wgSize[2]], + uintsPerOutput, + inputData, + (metadata: Uint32Array, output: Uint32Array) => { + return checkAny(metadata, output, wgThreads, inputData, testcase.filter); + } + ); + }); + +g.test('fragment').unimplemented(); diff --git a/src/webgpu/shader/execution/expression/call/builtin/subgroupBroadcast.spec.ts b/src/webgpu/shader/execution/expression/call/builtin/subgroupBroadcast.spec.ts index b2fa9e46ec7a..75fe27e8cb5d 100644 --- a/src/webgpu/shader/execution/expression/call/builtin/subgroupBroadcast.spec.ts +++ b/src/webgpu/shader/execution/expression/call/builtin/subgroupBroadcast.spec.ts @@ -318,6 +318,4 @@ fn main(@builtin(subgroup_invocation_id) id : u32, t.expectGPUBufferValuesEqual(outputBuffer, new Uint32Array(expect)); }); -g.test('dynamically_uniform_id').unimplemented(); - g.test('fragment').unimplemented(); diff --git a/src/webgpu/shader/execution/expression/call/builtin/subgroup_util.ts b/src/webgpu/shader/execution/expression/call/builtin/subgroup_util.ts index 60cf9f12a155..8749c136c3d2 100644 --- a/src/webgpu/shader/execution/expression/call/builtin/subgroup_util.ts +++ b/src/webgpu/shader/execution/expression/call/builtin/subgroup_util.ts @@ -1,10 +1,12 @@ import { assert, iterRange } from '../../../../../../common/util/util.js'; import { Float16Array } from '../../../../../../external/petamoriken/float16/float16.js'; -import { GPUTest } from '../../../../../gpu_test.js'; +import { GPUTest, TextureTestMixin } from '../../../../../gpu_test.js'; import { FPInterval } from '../../../../../util/floating_point.js'; import { sparseScalarF16Range, sparseScalarF32Range } from '../../../../../util/math.js'; import { PRNG } from '../../../../../util/prng.js'; +export class SubgroupTest extends TextureTestMixin(GPUTest) {} + export const kNumCases = 1000; export const kStride = 128; @@ -300,6 +302,8 @@ fn main( t.expectOK(checkAccuracy(metadata, output, [idx1, idx2], [val1, val2], identity, intervalGen)); } +export const kDataSentinel = 999; + /** * Runs compute shader subgroup test * @@ -346,14 +350,14 @@ export async function runComputeTest( const outputUints = outputUintsPerElement * wgThreads; const outputBuffer = t.makeBufferWithContents( - new Uint32Array([...iterRange(outputUints, x => 999)]), + new Uint32Array([...iterRange(outputUints, x => kDataSentinel)]), GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE ); t.trackForCleanup(outputBuffer); const numMetadata = 2 * wgThreads; const metadataBuffer = t.makeBufferWithContents( - new Uint32Array([...iterRange(numMetadata, x => 999)]), + new Uint32Array([...iterRange(numMetadata, x => kDataSentinel)]), GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE );