diff --git a/src/webgpu/listing_meta.json b/src/webgpu/listing_meta.json
index 9e73f590e65c..af40914773a2 100644
--- a/src/webgpu/listing_meta.json
+++ b/src/webgpu/listing_meta.json
@@ -1531,10 +1531,12 @@
   "webgpu:shader,execution,expression,call,builtin,subgroupAdd:fragment:*": { "subcaseMS": 0.229 },
   "webgpu:shader,execution,expression,call,builtin,subgroupAll:compute,all_active:*": { "subcaseMS": 5162.414 },
   "webgpu:shader,execution,expression,call,builtin,subgroupAll:compute,split:*": { "subcaseMS": 26610.627 },
-  "webgpu:shader,execution,expression,call,builtin,subgroupAll:fragment:*": { "subcaseMS": 0.172 },
+  "webgpu:shader,execution,expression,call,builtin,subgroupAll:fragment,all_active:*": { "subcaseMS": 0.172 },
+  "webgpu:shader,execution,expression,call,builtin,subgroupAll:fragment,split:*": { "subcaseMS": 0.327 },
   "webgpu:shader,execution,expression,call,builtin,subgroupAny:compute,all_active:*": { "subcaseMS": 7028.394 },
   "webgpu:shader,execution,expression,call,builtin,subgroupAny:compute,split:*": { "subcaseMS": 50.998 },
-  "webgpu:shader,execution,expression,call,builtin,subgroupAny:fragment:*": { "subcaseMS": 0.227 },
+  "webgpu:shader,execution,expression,call,builtin,subgroupAny:fragment,all_active:*": { "subcaseMS": 0.227 },
+  "webgpu:shader,execution,expression,call,builtin,subgroupAny:fragment,split:*": { "subcaseMS": 0.309 },
   "webgpu:shader,execution,expression,call,builtin,subgroupBallot:compute,split:*": { "subcaseMS": 38.740 },
   "webgpu:shader,execution,expression,call,builtin,subgroupBallot:fragment,split:*": { "subcaseMS": 0.331 },
   "webgpu:shader,execution,expression,call,builtin,subgroupBallot:fragment:*": { "subcaseMS": 0.059 },
diff --git a/src/webgpu/shader/execution/expression/call/builtin/subgroupAll.spec.ts b/src/webgpu/shader/execution/expression/call/builtin/subgroupAll.spec.ts
index 831c33d8ed1c..0aa461c4a578 100644
--- a/src/webgpu/shader/execution/expression/call/builtin/subgroupAll.spec.ts
+++ b/src/webgpu/shader/execution/expression/call/builtin/subgroupAll.spec.ts
@@ -10,14 +10,18 @@ local_invocation_index. Tests should avoid assuming there is.
 import { makeTestGroup } from '../../../../../../common/framework/test_group.js';
 import { keysOf } from '../../../../../../common/util/data_tables.js';
 import { iterRange } from '../../../../../../common/util/util.js';
+import { kTextureFormatInfo } from '../../../../../format_info.js';
+import { align } from '../../../../../util/math.js';
 import { PRNG } from '../../../../../util/prng.js';
 
 import {
   kWGSizes,
   kPredicateCases,
   SubgroupTest,
-  runComputeTest,
   kDataSentinel,
+  kFramebufferSizes,
+  runComputeTest,
+  runFragmentTest,
 } from './subgroup_util.js';
 
 export const g = makeTestGroup(SubgroupTest);
@@ -34,9 +38,8 @@ const kNumCases = 15;
  * Seeds 10+ generate all random data
  * @param seed The seed for the PRNG
  * @param num The number of data items to generate
- * @param addCounter If true, treats the first index as an atomic counter
  */
-function generateInputData(seed: number, num: number, addCounter: boolean): Uint32Array {
+function generateInputData(seed: number, num: number): Uint32Array {
   const prng = new PRNG(seed);
 
   const bound = Math.min(num, 32);
@@ -44,17 +47,12 @@ function generateInputData(seed: number, num: number, addCounter: boolean): Uint
 
   return new Uint32Array([
     ...iterRange(num, x => {
-      if (addCounter && x === 0) {
-        // Counter should start at 1 to avoid clear value.
-        return 1;
-      }
-
       if (seed === 0) {
         return 0;
       } else if (seed === 1) {
         return 1;
       } else if (seed < 10) {
-        const bounded = (addCounter ? x + 1 : x) % bound;
+        const bounded = x % bound;
         return bounded === index ? 0 : 1;
       }
       return prng.uniformInt(2);
@@ -174,8 +172,7 @@ fn main(
   outputs[lid] = res;
 }`;
 
-    const includeCounter = false;
-    const inputData = generateInputData(t.params.case, wgThreads, includeCounter);
+    const inputData = generateInputData(t.params.case, wgThreads);
     const uintsPerOutput = 2;
 
     await runComputeTest(
@@ -246,8 +243,7 @@ fn main(
   }
 }`;
 
-    const includeCounter = false;
-    const inputData = generateInputData(t.params.case, wgThreads, includeCounter);
+    const inputData = generateInputData(t.params.case, wgThreads);
     const uintsPerOutput = 2;
 
     await runComputeTest(
@@ -262,4 +258,133 @@ fn main(
     );
   });
 
-g.test('fragment').unimplemented();
+/**
+ * Checks subgroupAll results from a fragment shader.
+ *
+ * @param data Framebuffer output
+ *             * component 0 is result
+ *             * component 1 is generated subgroup id
+ * @param input An array of input data
+ * @param format The framebuffer format
+ * @param width Framebuffer width
+ * @param height Framebuffer height
+ */
+function checkFragmentAll(
+  data: Uint32Array,
+  input: Uint32Array,
+  format: GPUTextureFormat,
+  width: number,
+  height: number
+): Error | undefined {
+  const { blockWidth, blockHeight, bytesPerBlock } = kTextureFormatInfo[format];
+  const blocksPerRow = width / blockWidth;
+  // 256 minimum comes from image copy requirements.
+  const bytesPerRow = align(blocksPerRow * (bytesPerBlock ?? 1), 256);
+  const uintsPerRow = bytesPerRow / 4;
+  const uintsPerTexel = (bytesPerBlock ?? 1) / blockWidth / blockHeight / 4;
+
+  // Iteration skips last row and column to avoid helper invocations because it is not
+  // guaranteed whether or not they participate in the subgroup operation.
+  const expected = new Map<number, number>();
+  for (let row = 0; row < height - 1; row++) {
+    for (let col = 0; col < width - 1; col++) {
+      const offset = uintsPerRow * row + col * uintsPerTexel;
+      const subgroup_id = data[offset + 1];
+
+      if (subgroup_id === 0) {
+        return new Error(`Internal error: helper invocation at (${col}, ${row})`);
+      }
+
+      let v = expected.get(subgroup_id) ?? 1;
+      // Input holds one value per pixel in row-major order (no counter slot).
+      v &= input[row * width + col];
+      expected.set(subgroup_id, v);
+    }
+  }
+
+  for (let row = 0; row < height - 1; row++) {
+    for (let col = 0; col < width - 1; col++) {
+      const offset = uintsPerRow * row + col * uintsPerTexel;
+      const res = data[offset];
+      const subgroup_id = data[offset + 1];
+
+      if (subgroup_id === 0) {
+        // Defensive only: zero ids were already rejected by the first pass.
+        continue;
+      }
+
+      const expected_v = expected.get(subgroup_id) ?? 0;
+      if (expected_v !== res) {
+        return new Error(`Row ${row}, col ${col}: incorrect results:
+- expected: ${expected_v}
+- got: ${res}`);
+      }
+    }
+  }
+
+  return undefined;
+}
+
+g.test('fragment,all_active')
+  .desc('Tests subgroupAll in fragment shaders')
+  .params(u =>
+    u
+      .combine('size', kFramebufferSizes)
+      .beginSubcases()
+      .combine('case', [...iterRange(kNumCases, x => x)])
+      .combineWithParams([{ format: 'rg32uint' }] as const)
+  )
+  .beforeAllSubcases(t => {
+    t.selectDeviceOrSkipTestCase('subgroups' as GPUFeatureName);
+  })
+  .fn(async t => {
+    const numInputs = t.params.size[0] * t.params.size[1];
+    const inputData = generateInputData(t.params.case, numInputs);
+
+    const fsShader = `
+enable subgroups;
+
+@group(0) @binding(0)
+var<storage> inputs : array<u32>;
+
+@fragment
+fn main(
+  @builtin(position) pos : vec4f,
+) -> @location(0) vec2u {
+  // Generate a subgroup id based on linearized position, but avoid 0.
+  let linear = u32(pos.x) + u32(pos.y) * ${t.params.size[0]};
+  var subgroup_id = linear + 1;
+  subgroup_id = subgroupBroadcastFirst(subgroup_id);
+
+  // Filter out possible helper invocations.
+  let x_in_range = u32(pos.x) < (${t.params.size[0]} - 1);
+  let y_in_range = u32(pos.y) < (${t.params.size[1]} - 1);
+  let in_range = x_in_range && y_in_range;
+  let input = select(1u, inputs[linear], in_range);
+
+  let res = select(0u, 1u, subgroupAll(bool(input)));
+  return vec2u(res, subgroup_id);
+}`;
+
+    await runFragmentTest(
+      t,
+      t.params.format,
+      fsShader,
+      t.params.size[0],
+      t.params.size[1],
+      inputData,
+      (data: Uint32Array) => {
+        return checkFragmentAll(
+          data,
+          inputData,
+          t.params.format,
+          t.params.size[0],
+          t.params.size[1]
+        );
+      }
+    );
+  });
+
+// Using subgroup operations in control flow with fragment shaders
+// quickly leads to unportable behavior.
+g.test('fragment,split').unimplemented();
diff --git a/src/webgpu/shader/execution/expression/call/builtin/subgroupAny.spec.ts b/src/webgpu/shader/execution/expression/call/builtin/subgroupAny.spec.ts
index 6418eb141dc5..5d5b9de11420 100644
--- a/src/webgpu/shader/execution/expression/call/builtin/subgroupAny.spec.ts
+++ b/src/webgpu/shader/execution/expression/call/builtin/subgroupAny.spec.ts
@@ -10,14 +10,18 @@ local_invocation_index. Tests should avoid assuming there is.
 import { makeTestGroup } from '../../../../../../common/framework/test_group.js';
 import { keysOf } from '../../../../../../common/util/data_tables.js';
 import { iterRange } from '../../../../../../common/util/util.js';
+import { kTextureFormatInfo } from '../../../../../format_info.js';
+import { align } from '../../../../../util/math.js';
 import { PRNG } from '../../../../../util/prng.js';
 
 import {
   kWGSizes,
   kPredicateCases,
   SubgroupTest,
-  runComputeTest,
   kDataSentinel,
+  kFramebufferSizes,
+  runComputeTest,
+  runFragmentTest,
 } from './subgroup_util.js';
 
 export const g = makeTestGroup(SubgroupTest);
@@ -34,9 +38,8 @@ const kNumCases = 15;
  * Seeds 10+ generate all random data
  * @param seed The seed for the PRNG
  * @param num The number of data items to generate
- * @param addCounter If true, treats the first index as an atomic counter
  */
-function generateInputData(seed: number, num: number, addCounter: boolean): Uint32Array {
+function generateInputData(seed: number, num: number): Uint32Array {
   const prng = new PRNG(seed);
 
   const bound = Math.min(num, 32);
@@ -44,17 +47,12 @@ function generateInputData(seed: number, num: number, addCounter: boolean): Uint
 
   return new Uint32Array([
     ...iterRange(num, x => {
-      if (addCounter && x === 0) {
-        // Counter should start at 1 to avoid clear value.
-        return 1;
-      }
-
       if (seed === 0) {
         return 0;
       } else if (seed === 1) {
         return 1;
       } else if (seed < 10) {
-        const bounded = (addCounter ? x + 1 : x) % bound;
+        const bounded = x % bound;
         return bounded === index ? 1 : 0;
       }
       return prng.uniformInt(2);
@@ -174,8 +172,7 @@ fn main(
   outputs[lid] = res;
 }`;
 
-    const includeCounter = false;
-    const inputData = generateInputData(t.params.case, wgThreads, includeCounter);
+    const inputData = generateInputData(t.params.case, wgThreads);
     const uintsPerOutput = 2;
 
     await runComputeTest(
@@ -246,8 +243,7 @@ fn main(
   }
 }`;
 
-    const includeCounter = false;
-    const inputData = generateInputData(t.params.case, wgThreads, includeCounter);
+    const inputData = generateInputData(t.params.case, wgThreads);
     const uintsPerOutput = 2;
 
     await runComputeTest(
@@ -262,4 +258,133 @@ fn main(
     );
   });
 
-g.test('fragment').unimplemented();
+/**
+ * Checks subgroupAny results from a fragment shader.
+ *
+ * @param data Framebuffer output
+ *             * component 0 is result
+ *             * component 1 is generated subgroup id
+ * @param input An array of input data
+ * @param format The framebuffer format
+ * @param width Framebuffer width
+ * @param height Framebuffer height
+ */
+function checkFragmentAny(
+  data: Uint32Array,
+  input: Uint32Array,
+  format: GPUTextureFormat,
+  width: number,
+  height: number
+): Error | undefined {
+  const { blockWidth, blockHeight, bytesPerBlock } = kTextureFormatInfo[format];
+  const blocksPerRow = width / blockWidth;
+  // 256 minimum comes from image copy requirements.
+  const bytesPerRow = align(blocksPerRow * (bytesPerBlock ?? 1), 256);
+  const uintsPerRow = bytesPerRow / 4;
+  const uintsPerTexel = (bytesPerBlock ?? 1) / blockWidth / blockHeight / 4;
+
+  // Iteration skips last row and column to avoid helper invocations because it is not
+  // guaranteed whether or not they participate in the subgroup operation.
+  const expected = new Map<number, number>();
+  for (let row = 0; row < height - 1; row++) {
+    for (let col = 0; col < width - 1; col++) {
+      const offset = uintsPerRow * row + col * uintsPerTexel;
+      const subgroup_id = data[offset + 1];
+
+      if (subgroup_id === 0) {
+        return new Error(`Internal error: helper invocation at (${col}, ${row})`);
+      }
+
+      let v = expected.get(subgroup_id) ?? 0;
+      // Input holds one value per pixel in row-major order (no counter slot).
+      v |= input[row * width + col];
+      expected.set(subgroup_id, v);
+    }
+  }
+
+  for (let row = 0; row < height - 1; row++) {
+    for (let col = 0; col < width - 1; col++) {
+      const offset = uintsPerRow * row + col * uintsPerTexel;
+      const res = data[offset];
+      const subgroup_id = data[offset + 1];
+
+      if (subgroup_id === 0) {
+        // Defensive only: zero ids were already rejected by the first pass.
+        continue;
+      }
+
+      const expected_v = expected.get(subgroup_id) ?? 0;
+      if (expected_v !== res) {
+        return new Error(`Row ${row}, col ${col}: incorrect results:
+- expected: ${expected_v}
+- got: ${res}`);
+      }
+    }
+  }
+
+  return undefined;
+}
+
+g.test('fragment,all_active')
+  .desc('Tests subgroupAny in fragment shaders')
+  .params(u =>
+    u
+      .combine('size', kFramebufferSizes)
+      .beginSubcases()
+      .combine('case', [...iterRange(kNumCases, x => x)])
+      .combineWithParams([{ format: 'rg32uint' }] as const)
+  )
+  .beforeAllSubcases(t => {
+    t.selectDeviceOrSkipTestCase('subgroups' as GPUFeatureName);
+  })
+  .fn(async t => {
+    const numInputs = t.params.size[0] * t.params.size[1];
+    const inputData = generateInputData(t.params.case, numInputs);
+
+    const fsShader = `
+enable subgroups;
+
+@group(0) @binding(0)
+var<storage> inputs : array<u32>;
+
+@fragment
+fn main(
+  @builtin(position) pos : vec4f,
+) -> @location(0) vec2u {
+  // Generate a subgroup id based on linearized position, but avoid 0.
+  let linear = u32(pos.x) + u32(pos.y) * ${t.params.size[0]};
+  var subgroup_id = linear + 1;
+  subgroup_id = subgroupBroadcastFirst(subgroup_id);
+
+  // Filter out possible helper invocations.
+  let x_in_range = u32(pos.x) < (${t.params.size[0]} - 1);
+  let y_in_range = u32(pos.y) < (${t.params.size[1]} - 1);
+  let in_range = x_in_range && y_in_range;
+  let input = select(0u, inputs[linear], in_range);
+
+  let res = select(0u, 1u, subgroupAny(bool(input)));
+  return vec2u(res, subgroup_id);
+}`;
+
+    await runFragmentTest(
+      t,
+      t.params.format,
+      fsShader,
+      t.params.size[0],
+      t.params.size[1],
+      inputData,
+      (data: Uint32Array) => {
+        return checkFragmentAny(
+          data,
+          inputData,
+          t.params.format,
+          t.params.size[0],
+          t.params.size[1]
+        );
+      }
+    );
+  });
+
+// Using subgroup operations in control flow with fragment shaders
+// quickly leads to unportable behavior.
+g.test('fragment,split').unimplemented();
diff --git a/src/webgpu/shader/execution/expression/call/builtin/subgroup_util.ts b/src/webgpu/shader/execution/expression/call/builtin/subgroup_util.ts
index 8749c136c3d2..9d147de1968b 100644
--- a/src/webgpu/shader/execution/expression/call/builtin/subgroup_util.ts
+++ b/src/webgpu/shader/execution/expression/call/builtin/subgroup_util.ts
@@ -1,8 +1,9 @@
 import { assert, iterRange } from '../../../../../../common/util/util.js';
 import { Float16Array } from '../../../../../../external/petamoriken/float16/float16.js';
+import { kTextureFormatInfo } from '../../../../../format_info.js';
 import { GPUTest, TextureTestMixin } from '../../../../../gpu_test.js';
 import { FPInterval } from '../../../../../util/floating_point.js';
-import { sparseScalarF16Range, sparseScalarF32Range } from '../../../../../util/math.js';
+import { sparseScalarF16Range, sparseScalarF32Range, align } from '../../../../../util/math.js';
 import { PRNG } from '../../../../../util/prng.js';
 
 export class SubgroupTest extends TextureTestMixin(GPUTest) {}
@@ -419,3 +420,135 @@ export async function runComputeTest(
 
   t.expectOK(checkFunction(metadata, output));
 }
+
+// Minimum size is [3, 3].
+export const kFramebufferSizes = [
+  [15, 15],
+  [16, 16],
+  [17, 17],
+  [19, 13],
+  [13, 10],
+  [111, 3],
+  [3, 111],
+  [35, 3],
+  [3, 35],
+  [53, 13],
+  [13, 53],
+  [3, 3],
+] as const;
+
+/**
+ * Runs a subgroup builtin test for fragment shaders
+ *
+ * This test draws a full screen triangle.
+ * Tests should avoid checking the last row or column to avoid helper
+ * invocations. Underlying APIs do not consistently guarantee whether
+ * helper invocations participate in subgroup operations.
+ * @param t The base test
+ * @param format The framebuffer format
+ * @param fsShader The fragment shader with the following interface:
+ *                 Location 0 output is framebuffer with format
+ *                 Group 0 binding 0 is input data
+ * @param width The framebuffer width
+ * @param height The framebuffer height
+ * @param inputData The input data
+ * @param checker A functor to check the framebuffer values
+ */
+export async function runFragmentTest(
+  t: SubgroupTest,
+  format: GPUTextureFormat,
+  fsShader: string,
+  width: number,
+  height: number,
+  inputData: Uint32Array | Float32Array | Float16Array,
+  checker: (data: Uint32Array) => Error | undefined
+): Promise<void> {
+  const vsShader = `
+@vertex
+fn vsMain(@builtin(vertex_index) index : u32) -> @builtin(position) vec4f {
+  const vertices = array(
+    vec2(-2, 4), vec2(-2, -4), vec2(2, 0),
+  );
+  return vec4f(vec2f(vertices[index]), 0, 1);
+}`;
+
+  assert(width >= 3, 'Minimum width is 3');
+  assert(height >= 3, 'Minimum height is 3');
+  const pipeline = t.device.createRenderPipeline({
+    layout: 'auto',
+    vertex: {
+      module: t.device.createShaderModule({ code: vsShader }),
+    },
+    fragment: {
+      module: t.device.createShaderModule({ code: fsShader }),
+      targets: [{ format }],
+    },
+    primitive: {
+      topology: 'triangle-list',
+    },
+  });
+
+  const { blockWidth, blockHeight, bytesPerBlock } = kTextureFormatInfo[format];
+  assert(bytesPerBlock !== undefined);
+
+  const blocksPerRow = width / blockWidth;
+  const blocksPerColumn = height / blockHeight;
+  // 256 minimum arises from image copy requirements.
+  const bytesPerRow = align(blocksPerRow * (bytesPerBlock ?? 1), 256);
+  const byteLength = bytesPerRow * blocksPerColumn;
+  const uintLength = byteLength / 4;
+
+  const buffer = t.makeBufferWithContents(
+    inputData,
+    GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST
+  );
+
+  const bg = t.device.createBindGroup({
+    layout: pipeline.getBindGroupLayout(0),
+    entries: [
+      {
+        binding: 0,
+        resource: {
+          buffer,
+        },
+      },
+    ],
+  });
+
+  const framebuffer = t.createTextureTracked({
+    size: [width, height],
+    usage:
+      GPUTextureUsage.COPY_SRC |
+      GPUTextureUsage.COPY_DST |
+      GPUTextureUsage.RENDER_ATTACHMENT |
+      GPUTextureUsage.TEXTURE_BINDING,
+    format,
+  });
+
+  const encoder = t.device.createCommandEncoder();
+  const pass = encoder.beginRenderPass({
+    colorAttachments: [
+      {
+        view: framebuffer.createView(),
+        loadOp: 'clear',
+        storeOp: 'store',
+      },
+    ],
+  });
+  pass.setPipeline(pipeline);
+  pass.setBindGroup(0, bg);
+  pass.draw(3);
+  pass.end();
+  t.queue.submit([encoder.finish()]);
+
+  const copyBuffer = t.copyWholeTextureToNewBufferSimple(framebuffer, 0);
+  const readback = await t.readGPUBufferRangeTyped(copyBuffer, {
+    srcByteOffset: 0,
+    type: Uint32Array,
+    typedLength: uintLength,
+    method: 'copy',
+  });
+  const data: Uint32Array = readback.data;
+
+  t.expectOK(checker(data));
+}