From 077ffee360f156d7638bce8b3b762c196f4a0793 Mon Sep 17 00:00:00 2001 From: Greggman Date: Thu, 9 Jan 2025 11:24:13 +0900 Subject: [PATCH] Compat: Refactor fwidth/Fine/Coarse for 0 storage buffers. (#4128) Modified so this test doesn't use storage buffers by having it return values from a fragment shader as rgba32uint --- .../expression/call/builtin/derivatives.ts | 11 +- .../expression/call/builtin/fwidth.ts | 283 ++++++++++-------- 2 files changed, 161 insertions(+), 133 deletions(-) diff --git a/src/webgpu/shader/execution/expression/call/builtin/derivatives.ts b/src/webgpu/shader/execution/expression/call/builtin/derivatives.ts index b6c7d54669d3..40f0adf279f0 100644 --- a/src/webgpu/shader/execution/expression/call/builtin/derivatives.ts +++ b/src/webgpu/shader/execution/expression/call/builtin/derivatives.ts @@ -33,12 +33,11 @@ export function runDerivativeTest( // We will populate a uniform buffer with these input values laid out sequentially: // [ case_0_input_1, case_0_input_0, case_1_input_1, case_1_input_0, ...] // - // The render pipeline will be launched once per pixel per pair of cases over - // a viewport size of (2, 2) with the viewport set to cover 1 pixel. - // Each 2x2 set of calls will will exercise two test cases. Each of these - // draw calls will use a different instance index, which is forwarded to the - // fragment shader. Each invocation returns the result which is stored in - // a rgba32uint texture. + // The render pipeline will be launched once per pair of cases over a viewport + // size of (2, 2). Each 2x2 set of calls will exercise two test cases. + // Each of these draw calls will use a different instance index, which is + // forwarded to the fragment shader. Each invocation returns the result which + // is stored in a rgba32uint texture. // // Consider draw calls that test 4 cases (c_0, c_1, c_2, c_3). 
// diff --git a/src/webgpu/shader/execution/expression/call/builtin/fwidth.ts b/src/webgpu/shader/execution/expression/call/builtin/fwidth.ts index d87f6b06c2da..795df55ce97c 100644 --- a/src/webgpu/shader/execution/expression/call/builtin/fwidth.ts +++ b/src/webgpu/shader/execution/expression/call/builtin/fwidth.ts @@ -1,7 +1,9 @@ +import { assert } from '../../../../../../common/util/util.js'; import { GPUTest } from '../../../../../gpu_test.js'; import { anyOf } from '../../../../../util/compare.js'; import { Type, Value } from '../../../../../util/conversion.js'; import { FPInterval } from '../../../../../util/floating_point.js'; +import { align } from '../../../../../util/math.js'; import { Case } from '../../case.js'; import { toComparator } from '../../expectation.js'; @@ -22,14 +24,11 @@ export function runFWidthTest( ) { //////////////////////////////////////////////////////////////// // The four input values for a given case are distributed to across the invocations in a quad. - // We will populate a storage buffer with these input values laid out sequentially: + // We will populate a uniform buffer with these input values laid out sequentially: // [ case0_input0, case0_input1, case0_input2, case0_input3, ...] // - // The render pipeline will be launched several times over a viewport size of (2, 2). Each draw - // call will execute a single quad (four fragment invocation), which will exercise one test case. - // Each of these draw calls will use a different instance index, which is forwarded to the - // fragment shader. Each invocation will determine its index into the storage buffer using its - // fragment position and the instance index for that draw call. + // The render pipeline renders to a 512x2 texture. In the fragment shader, each 2x2 block of texels is one test case. + // The results are the output from the fragment shader. // // Consider two draw calls that test 2 cases (c0, c1). 
// @@ -46,46 +45,56 @@ export function runFWidthTest( } // Determine the WGSL type to use in the shader, and the stride in bytes between values. - let valueStride = 4; - let wgslType = 'f32'; + const valueStride = 16; + let conversionFromInput = 'input.x'; + let conversionToOutput = `vec4f(v, 0, 0, 0)`; if (vectorize) { - wgslType = `vec${vectorize}f`; - valueStride = vectorize * 4; - if (vectorize === 3) { - valueStride = 16; + switch (vectorize) { + case 2: + conversionFromInput = 'input.xy'; + conversionToOutput = 'vec4f(v, 0, 0)'; + break; + case 3: + conversionFromInput = 'input.xyz'; + conversionToOutput = 'vec4f(v, 0)'; + break; + case 4: + conversionFromInput = 'input'; + conversionToOutput = 'v'; + break; } } + const kUniformBufferSize = 16384; // min supported by compat mode. + const kNumCasesPerUniformBuffer = kUniformBufferSize / 64; + // Define a vertex shader that draws a triangle over the full viewport, and a fragment shader that // calls the fwidth builtin with a value loaded from that fragment's index into the storage // buffer (determined using the quad index and fragment position, as described above). 
const code = ` -struct CaseInfo { - @builtin(position) position: vec4f, - @location(0) @interpolate(flat, either) quad_idx: u32, -} - @vertex -fn vert(@builtin(vertex_index) vertex_idx: u32, - @builtin(instance_index) instance_idx: u32) -> CaseInfo { +fn vert(@builtin(vertex_index) vertex_idx: u32) -> @builtin(position) vec4f { const kVertices = array( - vec2f(-2, -2), - vec2f( 2, -2), - vec2f( 0, 2), + vec2f( 3, -1), + vec2f(-1, 3), + vec2f(-1, -1), ); - return CaseInfo(vec4(kVertices[vertex_idx], 0, 1), instance_idx); + return vec4(kVertices[vertex_idx], 0, 1); } -@group(0) @binding(0) var inputs : array<${wgslType}>; -@group(0) @binding(1) var outputs : array<${wgslType}>; +@group(0) @binding(0) var inputs : array; @fragment -fn frag(info : CaseInfo) { - let inv_idx = u32(info.position.x) + u32(info.position.y)*2; - let index = info.quad_idx*4 + inv_idx; +fn frag(@builtin(position) position: vec4f) -> @location(0) vec4u { + let t = vec2u(position.xy); + let inv_idx = t.x % 2 + (t.y % 2) * 2; + let q = t / 2; + let quad_idx = q.y * 256 + q.x; + let index = quad_idx * 4 + inv_idx; let input = inputs[index]; ${non_uniform_discard ? 'if inv_idx == 0 { discard; }' : ''} - outputs[index] = ${builtin}(input); + let v = ${builtin}(${conversionFromInput}); + return bitcast(${conversionToOutput}); } `; @@ -94,116 +103,136 @@ fn frag(info : CaseInfo) { const pipeline = t.device.createRenderPipeline({ layout: 'auto', vertex: { module }, - fragment: { module, targets: [{ format: 'rgba8unorm', writeMask: 0 }] }, - }); - - // Create storage buffers to hold the inputs and outputs. 
- const bufferSize = cases.length * 4 * valueStride; - const inputBuffer = t.createBufferTracked({ - size: bufferSize, - usage: GPUBufferUsage.STORAGE, - mappedAtCreation: true, - }); - const outputBuffer = t.createBufferTracked({ - size: bufferSize, - usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC, + fragment: { module, targets: [{ format: 'rgba32uint' }] }, }); - // Populate the input storage buffer with case input values. - const valuesData = new Uint8Array(inputBuffer.getMappedRange()); - for (let i = 0; i < cases.length / vectorWidth; i++) { - for (let v = 0; v < vectorWidth; v++) { - const index = i * vectorWidth + v; - if (index >= cases.length) { - break; - } - const inputs = cases[index].input as ReadonlyArray; - for (let x = 0; x < 4; x++) { - inputs[x].copyTo(valuesData, (i * 4 + x) * valueStride + v * 4); - } - } - } - inputBuffer.unmap(); - - // Create a bind group for the storage buffers. - const group = t.device.createBindGroup({ - entries: [ - { binding: 0, resource: { buffer: inputBuffer } }, - { binding: 1, resource: { buffer: outputBuffer } }, - ], - layout: pipeline.getBindGroupLayout(0), - }); - - // Create a texture to use as a color attachment. - // We only need this for launching the desired number of fragment invocations. + // Create a texture to use as a color attachment to receive the results; + const width = kNumCasesPerUniformBuffer * 2; + const height = 2; + // note: We could limit it to this size and increase height but kNumCasesPerUniformBuffer is limited to 256 + // because we can't fit more into a single uniform buffer in compat. 
+ assert(width < t.device.limits.maxTextureDimension2D); const colorAttachment = t.createTextureTracked({ - size: { width: 2, height: 2 }, - format: 'rgba8unorm', - usage: GPUTextureUsage.RENDER_ATTACHMENT, + size: [width, height], + format: 'rgba32uint', + usage: GPUTextureUsage.RENDER_ATTACHMENT | GPUTextureUsage.COPY_SRC, }); + const bytesPerRow = align(width * 16, 256); - // Submit the render pass to the device. + const results = []; const encoder = t.device.createCommandEncoder(); - const pass = encoder.beginRenderPass({ - colorAttachments: [ - { - view: colorAttachment.createView(), - loadOp: 'clear', - storeOp: 'discard', - }, - ], - }); - pass.setPipeline(pipeline); - pass.setBindGroup(0, group); - for (let quad = 0; quad < cases.length / vectorWidth; quad++) { - pass.draw(3, 1, undefined, quad); + for (let c = 0; c < cases.length; c += kNumCasesPerUniformBuffer) { + // Create uniform buffer to hold the inputs. + const inputBuffer = t.createBufferTracked({ + size: kUniformBufferSize, + usage: GPUBufferUsage.UNIFORM, + mappedAtCreation: true, + }); + const valuesData = new Uint8Array(inputBuffer.getMappedRange()); + + // Populate the input uniform buffer with case input values. + for (let i = 0; i < kNumCasesPerUniformBuffer / vectorWidth; i++) { + for (let v = 0; v < vectorWidth; v++) { + const index = c + i * vectorWidth + v; + if (index >= cases.length) { + break; + } + const inputs = cases[index].input as ReadonlyArray; + for (let x = 0; x < 4; x++) { + inputs[x].copyTo(valuesData, (i * 4 + x) * valueStride + v * 4); + } + } + } + inputBuffer.unmap(); + + // Create a bind group for the input buffer. + const group = t.device.createBindGroup({ + entries: [{ binding: 0, resource: { buffer: inputBuffer } }], + layout: pipeline.getBindGroupLayout(0), + }); + + // Submit the render pass to the device. 
+ const pass = encoder.beginRenderPass({ + colorAttachments: [ + { + view: colorAttachment.createView(), + loadOp: 'clear', + storeOp: 'store', + }, + ], + }); + pass.setPipeline(pipeline); + pass.setBindGroup(0, group); + pass.draw(3); + pass.end(); + + // Create buffer to hold the outputs. + const outputBuffer = t.createBufferTracked({ + size: bytesPerRow * height, + usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC, + }); + results.push(outputBuffer); + + // Copy the texture to the output buffer + encoder.copyTextureToBuffer( + { texture: colorAttachment }, + { buffer: outputBuffer, bytesPerRow }, + [colorAttachment.width, colorAttachment.height] + ); } - pass.end(); t.queue.submit([encoder.finish()]); - // Check the outputs match the expected results. - t.expectGPUBufferValuesPassCheck( - outputBuffer, - (outputData: Uint8Array) => { - for (let i = 0; i < cases.length / vectorWidth; i++) { - for (let v = 0; v < vectorWidth; v++) { - const index = i * vectorWidth + v; - if (index >= cases.length) { - break; - } - const c = cases[index]; - - for (let x = 0; x < 4; x++) { - if (non_uniform_discard && x === 0) { - continue; + results.forEach((outputBuffer, groupNdx) => { + // Check the outputs match the expected results. 
+ t.expectGPUBufferValuesPassCheck( + outputBuffer, + (outputData: Uint8Array) => { + const base = groupNdx * kNumCasesPerUniformBuffer; + const numCases = Math.min(kNumCasesPerUniformBuffer, cases.length - base); + const numQuads = numCases / vectorWidth; + for (let i = 0; i < numQuads; i++) { + for (let v = 0; v < vectorWidth; v++) { + const caseNdx = base + i * vectorWidth + v; + if (caseNdx >= cases.length) { + break; } - - const index = (i * 4 + x) * valueStride + v * 4; - const result = Type.f32.read(outputData, index); - - let expected = c.expected; - if (builtin.endsWith('Fine')) { - expected = toComparator((expected as FPInterval[])[x]); - } else { - expected = anyOf(...(expected as FPInterval[])); - } - - const cmp = expected.compare(result); - if (!cmp.matched) { - return new Error(` - inputs: (${(c.input as Value[]).join(', ')}) - expected: ${cmp.expected} - - returned: ${result}`); + const c = cases[caseNdx]; + + for (let x = 0; x < 4; x++) { + if (non_uniform_discard && x === 0) { + continue; + } + + const tx = x % 2; + const ty = (x / 2) | 0; + const index = ty * bytesPerRow + i * 32 + tx * 16 + v * 4; + const result = Type.f32.read(outputData, index); + + let expected = c.expected; + if (builtin.endsWith('Fine')) { + expected = toComparator((expected as FPInterval[])[x]); + } else { + expected = anyOf(...(expected as FPInterval[])); + } + + const cmp = expected.compare(result); + if (!cmp.matched) { + return new Error(` + caseNdx: ${caseNdx} v: ${v} x: ${x} + inputs: (${(c.input as Value[]).join(', ')}) + expected: ${cmp.expected} + + returned: ${result}`); + } } } } + return undefined; + }, + { + type: Uint8Array, + typedLength: outputBuffer.size, } - return undefined; - }, - { - type: Uint8Array, - typedLength: bufferSize, - } - ); + ); + }); }