From 077ffee360f156d7638bce8b3b762c196f4a0793 Mon Sep 17 00:00:00 2001
From: Greggman <github@greggman.com>
Date: Thu, 9 Jan 2025 11:24:13 +0900
Subject: [PATCH] Compat: Refactor fwidth/Fine/Coarse for 0 storage buffers.
 (#4128)

Modified so this test doesn't use storage buffers by having
it return values from a fragment shader as rgba32uint
---
 .../expression/call/builtin/derivatives.ts    |  11 +-
 .../expression/call/builtin/fwidth.ts         | 283 ++++++++++--------
 2 files changed, 161 insertions(+), 133 deletions(-)

diff --git a/src/webgpu/shader/execution/expression/call/builtin/derivatives.ts b/src/webgpu/shader/execution/expression/call/builtin/derivatives.ts
index b6c7d54669d3..40f0adf279f0 100644
--- a/src/webgpu/shader/execution/expression/call/builtin/derivatives.ts
+++ b/src/webgpu/shader/execution/expression/call/builtin/derivatives.ts
@@ -33,12 +33,11 @@ export function runDerivativeTest(
   // We will populate a uniform buffer with these input values laid out sequentially:
   // [ case_0_input_1, case_0_input_0, case_1_input_1, case_1_input_0, ...]
   //
-  // The render pipeline will be launched once per pixel per pair of cases over
-  // a viewport size of (2, 2) with the viewport set to cover 1 pixel.
-  // Each 2x2 set of calls will will exercise two test cases. Each of these
-  // draw calls will use a different instance index, which is forwarded to the
-  // fragment shader. Each invocation returns the result which is stored in
-  // a rgba32uint texture.
+  // The render pipeline will be launched once per pair of cases over a viewport
+  // size of (2, 2). Each 2x2 set of calls will exercise two test cases.
+  // Each of these draw calls will use a different instance index, which is
+  // forwarded to the fragment shader. Each invocation returns the result which
+  // is stored in a rgba32uint texture.
   //
   // Consider draw calls that test 4 cases (c_0, c_1, c_2, c_3).
   //
diff --git a/src/webgpu/shader/execution/expression/call/builtin/fwidth.ts b/src/webgpu/shader/execution/expression/call/builtin/fwidth.ts
index d87f6b06c2da..795df55ce97c 100644
--- a/src/webgpu/shader/execution/expression/call/builtin/fwidth.ts
+++ b/src/webgpu/shader/execution/expression/call/builtin/fwidth.ts
@@ -1,7 +1,9 @@
+import { assert } from '../../../../../../common/util/util.js';
 import { GPUTest } from '../../../../../gpu_test.js';
 import { anyOf } from '../../../../../util/compare.js';
 import { Type, Value } from '../../../../../util/conversion.js';
 import { FPInterval } from '../../../../../util/floating_point.js';
+import { align } from '../../../../../util/math.js';
 import { Case } from '../../case.js';
 import { toComparator } from '../../expectation.js';
 
@@ -22,14 +24,11 @@ export function runFWidthTest(
 ) {
   ////////////////////////////////////////////////////////////////
   // The four input values for a given case are distributed to across the invocations in a quad.
-  // We will populate a storage buffer with these input values laid out sequentially:
+  // We will populate a uniform buffer with these input values laid out sequentially:
   // [ case0_input0, case0_input1, case0_input2, case0_input3, ...]
   //
-  // The render pipeline will be launched several times over a viewport size of (2, 2). Each draw
-  // call will execute a single quad (four fragment invocation), which will exercise one test case.
-  // Each of these draw calls will use a different instance index, which is forwarded to the
-  // fragment shader. Each invocation will determine its index into the storage buffer using its
-  // fragment position and the instance index for that draw call.
+  // The render pipeline draws to a 512x2 texture. In the fragment shader, every 2x2 block of
+  // texels is one test case. The results are the output from the fragment shader.
   //
   // Consider two draw calls that test 2 cases (c0, c1).
   //
@@ -46,46 +45,56 @@ export function runFWidthTest(
   }
 
   // Determine the WGSL type to use in the shader, and the stride in bytes between values.
-  let valueStride = 4;
-  let wgslType = 'f32';
+  const valueStride = 16;
+  let conversionFromInput = 'input.x';
+  let conversionToOutput = `vec4f(v, 0, 0, 0)`;
   if (vectorize) {
-    wgslType = `vec${vectorize}f`;
-    valueStride = vectorize * 4;
-    if (vectorize === 3) {
-      valueStride = 16;
+    switch (vectorize) {
+      case 2:
+        conversionFromInput = 'input.xy';
+        conversionToOutput = 'vec4f(v, 0, 0)';
+        break;
+      case 3:
+        conversionFromInput = 'input.xyz';
+        conversionToOutput = 'vec4f(v, 0)';
+        break;
+      case 4:
+        conversionFromInput = 'input';
+        conversionToOutput = 'v';
+        break;
     }
   }
 
+  const kUniformBufferSize = 16384; // min supported by compat mode.
+  const kNumCasesPerUniformBuffer = kUniformBufferSize / 64;
+
   // Define a vertex shader that draws a triangle over the full viewport, and a fragment shader that
   // calls the fwidth builtin with a value loaded from that fragment's index into the storage
   // buffer (determined using the quad index and fragment position, as described above).
   const code = `
-struct CaseInfo {
-  @builtin(position) position: vec4f,
-  @location(0) @interpolate(flat, either) quad_idx: u32,
-}
-
 @vertex
-fn vert(@builtin(vertex_index) vertex_idx: u32,
-        @builtin(instance_index) instance_idx: u32) -> CaseInfo {
+fn vert(@builtin(vertex_index) vertex_idx: u32) -> @builtin(position) vec4f {
   const kVertices = array(
-    vec2f(-2, -2),
-    vec2f( 2, -2),
-    vec2f( 0,  2),
+    vec2f( 3, -1),
+    vec2f(-1,  3),
+    vec2f(-1, -1),
   );
-  return CaseInfo(vec4(kVertices[vertex_idx], 0, 1), instance_idx);
+  return vec4(kVertices[vertex_idx], 0, 1);
 }
 
-@group(0) @binding(0) var<storage, read> inputs : array<${wgslType}>;
-@group(0) @binding(1) var<storage, read_write> outputs : array<${wgslType}>;
+@group(0) @binding(0) var<uniform> inputs : array<vec4f, ${kNumCasesPerUniformBuffer * 4}>;
 
 @fragment
-fn frag(info : CaseInfo) {
-  let inv_idx = u32(info.position.x) + u32(info.position.y)*2;
-  let index = info.quad_idx*4 + inv_idx;
+fn frag(@builtin(position) position: vec4f) -> @location(0) vec4u {
+  let t = vec2u(position.xy);
+  let inv_idx = t.x % 2 + (t.y % 2) * 2;
+  let q = t / 2;
+  let quad_idx = q.y * 256 + q.x;
+  let index = quad_idx * 4 + inv_idx;
   let input = inputs[index];
   ${non_uniform_discard ? 'if inv_idx == 0 { discard; }' : ''}
-  outputs[index] = ${builtin}(input);
+  let v = ${builtin}(${conversionFromInput});
+  return bitcast<vec4u>(${conversionToOutput});
 }
 `;
 
@@ -94,116 +103,136 @@ fn frag(info : CaseInfo) {
   const pipeline = t.device.createRenderPipeline({
     layout: 'auto',
     vertex: { module },
-    fragment: { module, targets: [{ format: 'rgba8unorm', writeMask: 0 }] },
-  });
-
-  // Create storage buffers to hold the inputs and outputs.
-  const bufferSize = cases.length * 4 * valueStride;
-  const inputBuffer = t.createBufferTracked({
-    size: bufferSize,
-    usage: GPUBufferUsage.STORAGE,
-    mappedAtCreation: true,
-  });
-  const outputBuffer = t.createBufferTracked({
-    size: bufferSize,
-    usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
+    fragment: { module, targets: [{ format: 'rgba32uint' }] },
   });
 
-  // Populate the input storage buffer with case input values.
-  const valuesData = new Uint8Array(inputBuffer.getMappedRange());
-  for (let i = 0; i < cases.length / vectorWidth; i++) {
-    for (let v = 0; v < vectorWidth; v++) {
-      const index = i * vectorWidth + v;
-      if (index >= cases.length) {
-        break;
-      }
-      const inputs = cases[index].input as ReadonlyArray<Value>;
-      for (let x = 0; x < 4; x++) {
-        inputs[x].copyTo(valuesData, (i * 4 + x) * valueStride + v * 4);
-      }
-    }
-  }
-  inputBuffer.unmap();
-
-  // Create a bind group for the storage buffers.
-  const group = t.device.createBindGroup({
-    entries: [
-      { binding: 0, resource: { buffer: inputBuffer } },
-      { binding: 1, resource: { buffer: outputBuffer } },
-    ],
-    layout: pipeline.getBindGroupLayout(0),
-  });
-
-  // Create a texture to use as a color attachment.
-  // We only need this for launching the desired number of fragment invocations.
+  // Create a texture to use as a color attachment to receive the results.
+  const width = kNumCasesPerUniformBuffer * 2;
+  const height = 2;
+  // Note: we could cap the width at the texture limit and grow the height instead, but
+  // kNumCasesPerUniformBuffer is limited to 256 since a compat uniform buffer can't hold more.
+  assert(width < t.device.limits.maxTextureDimension2D);
   const colorAttachment = t.createTextureTracked({
-    size: { width: 2, height: 2 },
-    format: 'rgba8unorm',
-    usage: GPUTextureUsage.RENDER_ATTACHMENT,
+    size: [width, height],
+    format: 'rgba32uint',
+    usage: GPUTextureUsage.RENDER_ATTACHMENT | GPUTextureUsage.COPY_SRC,
   });
+  const bytesPerRow = align(width * 16, 256);
 
-  // Submit the render pass to the device.
+  const results = [];
   const encoder = t.device.createCommandEncoder();
-  const pass = encoder.beginRenderPass({
-    colorAttachments: [
-      {
-        view: colorAttachment.createView(),
-        loadOp: 'clear',
-        storeOp: 'discard',
-      },
-    ],
-  });
-  pass.setPipeline(pipeline);
-  pass.setBindGroup(0, group);
-  for (let quad = 0; quad < cases.length / vectorWidth; quad++) {
-    pass.draw(3, 1, undefined, quad);
+  for (let c = 0; c < cases.length; c += kNumCasesPerUniformBuffer) {
+    // Create uniform buffer to hold the inputs.
+    const inputBuffer = t.createBufferTracked({
+      size: kUniformBufferSize,
+      usage: GPUBufferUsage.UNIFORM,
+      mappedAtCreation: true,
+    });
+    const valuesData = new Uint8Array(inputBuffer.getMappedRange());
+
+    // Populate the input uniform buffer with case input values.
+    for (let i = 0; i < kNumCasesPerUniformBuffer / vectorWidth; i++) {
+      for (let v = 0; v < vectorWidth; v++) {
+        const index = c + i * vectorWidth + v;
+        if (index >= cases.length) {
+          break;
+        }
+        const inputs = cases[index].input as ReadonlyArray<Value>;
+        for (let x = 0; x < 4; x++) {
+          inputs[x].copyTo(valuesData, (i * 4 + x) * valueStride + v * 4);
+        }
+      }
+    }
+    inputBuffer.unmap();
+
+    // Create a bind group for the input buffer.
+    const group = t.device.createBindGroup({
+      entries: [{ binding: 0, resource: { buffer: inputBuffer } }],
+      layout: pipeline.getBindGroupLayout(0),
+    });
+
+    // Submit the render pass to the device.
+    const pass = encoder.beginRenderPass({
+      colorAttachments: [
+        {
+          view: colorAttachment.createView(),
+          loadOp: 'clear',
+          storeOp: 'store',
+        },
+      ],
+    });
+    pass.setPipeline(pipeline);
+    pass.setBindGroup(0, group);
+    pass.draw(3);
+    pass.end();
+
+    // Create buffer to hold the outputs.
+    const outputBuffer = t.createBufferTracked({
+      size: bytesPerRow * height,
+      usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC,
+    });
+    results.push(outputBuffer);
+
+    // Copy the texture to the output buffer
+    encoder.copyTextureToBuffer(
+      { texture: colorAttachment },
+      { buffer: outputBuffer, bytesPerRow },
+      [colorAttachment.width, colorAttachment.height]
+    );
   }
-  pass.end();
   t.queue.submit([encoder.finish()]);
 
-  // Check the outputs match the expected results.
-  t.expectGPUBufferValuesPassCheck(
-    outputBuffer,
-    (outputData: Uint8Array) => {
-      for (let i = 0; i < cases.length / vectorWidth; i++) {
-        for (let v = 0; v < vectorWidth; v++) {
-          const index = i * vectorWidth + v;
-          if (index >= cases.length) {
-            break;
-          }
-          const c = cases[index];
-
-          for (let x = 0; x < 4; x++) {
-            if (non_uniform_discard && x === 0) {
-              continue;
+  results.forEach((outputBuffer, groupNdx) => {
+    // Check the outputs match the expected results.
+    t.expectGPUBufferValuesPassCheck(
+      outputBuffer,
+      (outputData: Uint8Array) => {
+        const base = groupNdx * kNumCasesPerUniformBuffer;
+        const numCases = Math.min(kNumCasesPerUniformBuffer, cases.length - base);
+        const numQuads = numCases / vectorWidth;
+        for (let i = 0; i < numQuads; i++) {
+          for (let v = 0; v < vectorWidth; v++) {
+            const caseNdx = base + i * vectorWidth + v;
+            if (caseNdx >= cases.length) {
+              break;
             }
-
-            const index = (i * 4 + x) * valueStride + v * 4;
-            const result = Type.f32.read(outputData, index);
-
-            let expected = c.expected;
-            if (builtin.endsWith('Fine')) {
-              expected = toComparator((expected as FPInterval[])[x]);
-            } else {
-              expected = anyOf(...(expected as FPInterval[]));
-            }
-
-            const cmp = expected.compare(result);
-            if (!cmp.matched) {
-              return new Error(`
-    inputs: (${(c.input as Value[]).join(', ')})
-  expected: ${cmp.expected}
-
-  returned: ${result}`);
+            const c = cases[caseNdx];
+
+            for (let x = 0; x < 4; x++) {
+              if (non_uniform_discard && x === 0) {
+                continue;
+              }
+
+              const tx = x % 2;
+              const ty = (x / 2) | 0;
+              const index = ty * bytesPerRow + i * 32 + tx * 16 + v * 4;
+              const result = Type.f32.read(outputData, index);
+
+              let expected = c.expected;
+              if (builtin.endsWith('Fine')) {
+                expected = toComparator((expected as FPInterval[])[x]);
+              } else {
+                expected = anyOf(...(expected as FPInterval[]));
+              }
+
+              const cmp = expected.compare(result);
+              if (!cmp.matched) {
+                return new Error(`
+     caseNdx: ${caseNdx} v: ${v} x: ${x}
+      inputs: (${(c.input as Value[]).join(', ')})
+    expected: ${cmp.expected}
+
+    returned: ${result}`);
+              }
             }
           }
         }
+        return undefined;
+      },
+      {
+        type: Uint8Array,
+        typedLength: outputBuffer.size,
       }
-      return undefined;
-    },
-    {
-      type: Uint8Array,
-      typedLength: bufferSize,
-    }
-  );
+    );
+  });
 }