Execution tests for subgroupAny and subgroupAll (#3924)

* Compute tests with all active invocation and partially active invocations * Removed unimplemented dynamically uniform subgroupBroadcast test (due to const requirement)
gpuweb · Aug 30, 2024 · 9b30f7a · 9b30f7a
1 parent 3d24384
commit 9b30f7a
Showing 5 changed files with 543 additions and 6 deletions.
diff --git a/src/webgpu/listing_meta.json b/src/webgpu/listing_meta.json
@@ -1529,13 +1529,18 @@
   "webgpu:shader,execution,expression,call,builtin,subgroupAdd:data_types:*": { "subcaseMS": 9216.247 },
   "webgpu:shader,execution,expression,call,builtin,subgroupAdd:fp_accuracy:*": { "subcaseMS": 9952.350 },
   "webgpu:shader,execution,expression,call,builtin,subgroupAdd:fragment:*": { "subcaseMS": 0.229 },
+  "webgpu:shader,execution,expression,call,builtin,subgroupAll:compute,all_active:*": { "subcaseMS": 5162.414 },
+  "webgpu:shader,execution,expression,call,builtin,subgroupAll:compute,split:*": { "subcaseMS": 26610.627 },
+  "webgpu:shader,execution,expression,call,builtin,subgroupAll:fragment:*": { "subcaseMS": 0.172 },
+  "webgpu:shader,execution,expression,call,builtin,subgroupAny:compute,all_active:*": { "subcaseMS": 7028.394 },
+  "webgpu:shader,execution,expression,call,builtin,subgroupAny:compute,split:*": { "subcaseMS": 50.998 },
+  "webgpu:shader,execution,expression,call,builtin,subgroupAny:fragment:*": { "subcaseMS": 0.227 },
   "webgpu:shader,execution,expression,call,builtin,subgroupBallot:compute,split:*": { "subcaseMS": 38.740 },
   "webgpu:shader,execution,expression,call,builtin,subgroupBallot:fragment,split:*": { "subcaseMS": 0.331 },
   "webgpu:shader,execution,expression,call,builtin,subgroupBallot:fragment:*": { "subcaseMS": 0.059 },
   "webgpu:shader,execution,expression,call,builtin,subgroupBallot:predicate:*": { "subcaseMS": 0.075 },
   "webgpu:shader,execution,expression,call,builtin,subgroupBallot:predicate_and_control_flow:*": { "subcaseMS": 41.053 },
   "webgpu:shader,execution,expression,call,builtin,subgroupBroadcast:data_types:*": { "subcaseMS": 252.374 },
-  "webgpu:shader,execution,expression,call,builtin,subgroupBroadcast:dynamically_uniform_id:*": { "subcaseMS": 0.211 },
   "webgpu:shader,execution,expression,call,builtin,subgroupBroadcast:fragment:*": { "subcaseMS": 0.108 },
   "webgpu:shader,execution,expression,call,builtin,subgroupBroadcast:workgroup_uniform_load:*": { "subcaseMS": 109.832 },
   "webgpu:shader,execution,expression,call,builtin,subgroupMul:compute,split:*": { "subcaseMS": 5034.263 },

diff --git a/src/webgpu/shader/execution/expression/call/builtin/subgroupAll.spec.ts b/src/webgpu/shader/execution/expression/call/builtin/subgroupAll.spec.ts
@@ -0,0 +1,265 @@
+export const description = `
+Execution tests for subgroupAll.
+
+Note: There is a lack of portability for non-uniform execution so these tests
+restrict themselves to uniform control flow.
+Note: There is no guaranteed mapping between subgroup_invocation_id and
+local_invocation_index. Tests should avoid assuming there is.
+`;
+
+import { makeTestGroup } from '../../../../../../common/framework/test_group.js';
+import { keysOf } from '../../../../../../common/util/data_tables.js';
+import { iterRange } from '../../../../../../common/util/util.js';
+import { PRNG } from '../../../../../util/prng.js';
+
+import {
+  kWGSizes,
+  kPredicateCases,
+  SubgroupTest,
+  runComputeTest,
+  kDataSentinel,
+} from './subgroup_util.js';
+
+export const g = makeTestGroup(SubgroupTest);
+
+const kNumCases = 15;
+
+/**
+ * Generate input data for testing.
+ *
+ * Data is generated in the following categories:
+ * Seed 0 generates all 0 data
+ * Seed 1 generates all 1 data
+ * Seeds 2-9 generates all 1s except for a zero randomly once per 32 elements
+ * Seeds 10+ generate all random data
+ * @param seed The seed for the PRNG
+ * @param num The number of data items to generate
+ * @param addCounter If true, treats the first index as an atomic counter
+ */
+function generateInputData(seed: number, num: number, addCounter: boolean): Uint32Array {
+  const prng = new PRNG(seed);
+
+  const bound = Math.min(num, 32);
+  const index = prng.uniformInt(bound);
+
+  return new Uint32Array([
+    ...iterRange(num, x => {
+      if (addCounter && x === 0) {
+        // Counter should start at 1 to avoid clear value.
+        return 1;
+      }
+
+      if (seed === 0) {
+        return 0;
+      } else if (seed === 1) {
+        return 1;
+      } else if (seed < 10) {
+        const bounded = (addCounter ? x + 1 : x) % bound;
+        return bounded === index ? 0 : 1;
+      }
+      return prng.uniformInt(2);
+    }),
+  ]);
+}
+
+/**
+ * Checks the result of a subgroupAll operation
+ *
+ * Since subgroup size depends on the pipeline compile, we calculate the expected
+ * results after execution. The shader generates a subgroup id and records it for
+ * each invocation. The check first calculates the expected result for each subgroup
+ * and then compares to the actual result for each invocation. The filter functor
+ * ensures only the correct invocations contribute to the calculation.
+ * @param metadata An array of uints:
+ *                 * first half containing subgroup sizes (from builtin value)
+ *                 * second half subgroup invocation id
+ * @param output An array of uints containing:
+ *               * first half is the outputs of subgroupAll
+ *               * second half is a generated subgroup id
+ * @param numInvs Number of invocations executed
+ * @param input The input data (equal size to output)
+ * @param filter A functor to filter active invocations
+ */
+function checkAll(
+  metadata: Uint32Array, // unused
+  output: Uint32Array,
+  numInvs: number,
+  input: Uint32Array,
+  filter: (id: number, size: number) => boolean
+): Error | undefined {
+  // First, generate expected results.
+  const expected = new Map<number, number>();
+  for (let inv = 0; inv < numInvs; inv++) {
+    const size = metadata[inv];
+    const id = metadata[inv + numInvs];
+    if (!filter(id, size)) {
+      continue;
+    }
+    const subgroup_id = output[numInvs + inv];
+    let v = expected.get(subgroup_id) ?? 1;
+    v &= input[inv];
+    expected.set(subgroup_id, v);
+  }
+
+  // Second, check against actual results.
+  for (let inv = 0; inv < numInvs; inv++) {
+    const size = metadata[inv];
+    const id = metadata[inv + numInvs];
+    const res = output[inv];
+    if (filter(id, size)) {
+      const subgroup_id = output[numInvs + inv];
+      const expected_v = expected.get(subgroup_id) ?? 0;
+      if (expected_v !== res) {
+        return new Error(`Invocation ${inv}:
+- expected: ${expected_v}
+-      got: ${res}`);
+      }
+    } else {
+      if (res !== kDataSentinel) {
+        return new Error(`Invocation ${inv} unexpected write:
+- subgroup invocation id: ${id}
+-          subgroup size: ${size}`);
+      }
+    }
+  }
+
+  return undefined;
+}
+
+g.test('compute,all_active')
+  .desc(`Test compute subgroupAll`)
+  .params(u =>
+    u
+      .combine('wgSize', kWGSizes)
+      .beginSubcases()
+      .combine('case', [...iterRange(kNumCases, x => x)])
+  )
+  .beforeAllSubcases(t => {
+    t.selectDeviceOrSkipTestCase('subgroups' as GPUFeatureName);
+  })
+  .fn(async t => {
+    const wgThreads = t.params.wgSize[0] * t.params.wgSize[1] * t.params.wgSize[2];
+
+    const wgsl = `
+enable subgroups;
+
+@group(0) @binding(0)
+var<storage> inputs : array<u32>;
+
+@group(0) @binding(1)
+var<storage, read_write> outputs : array<u32>;
+
+struct Metadata {
+  subgroup_size: array<u32, ${wgThreads}>,
+  subgroup_invocation_id: array<u32, ${wgThreads}>,
+}
+
+@group(0) @binding(2)
+var<storage, read_write> metadata : Metadata;
+
+@compute @workgroup_size(${t.params.wgSize[0]}, ${t.params.wgSize[1]}, ${t.params.wgSize[2]})
+fn main(
+  @builtin(local_invocation_index) lid : u32,
+  @builtin(subgroup_invocation_id) id : u32,
+  @builtin(subgroup_size) subgroupSize : u32,
+) {
+  metadata.subgroup_size[lid] = subgroupSize;
+
+  metadata.subgroup_invocation_id[lid] = id;
+
+  // Record a representative subgroup id.
+  outputs[lid + ${wgThreads}] = subgroupBroadcastFirst(lid);
+
+  let res = select(0u, 1u, subgroupAll(bool(inputs[lid])));
+  outputs[lid] = res;
+}`;
+
+    const includeCounter = false;
+    const inputData = generateInputData(t.params.case, wgThreads, includeCounter);
+
+    const uintsPerOutput = 2;
+    await runComputeTest(
+      t,
+      wgsl,
+      [t.params.wgSize[0], t.params.wgSize[1], t.params.wgSize[2]],
+      uintsPerOutput,
+      inputData,
+      (metadata: Uint32Array, output: Uint32Array) => {
+        return checkAll(metadata, output, wgThreads, inputData, (id: number, size: number) => {
+          return true;
+        });
+      }
+    );
+  });
+
+g.test('compute,split')
+  .desc('Test that only active invocation participate')
+  .params(u =>
+    u
+      .combine('predicate', keysOf(kPredicateCases))
+      .beginSubcases()
+      .combine('wgSize', kWGSizes)
+      .combine('case', [...iterRange(kNumCases, x => x)])
+  )
+  .beforeAllSubcases(t => {
+    t.selectDeviceOrSkipTestCase('subgroups' as GPUFeatureName);
+  })
+  .fn(async t => {
+    const testcase = kPredicateCases[t.params.predicate];
+    const wgThreads = t.params.wgSize[0] * t.params.wgSize[1] * t.params.wgSize[2];
+
+    const wgsl = `
+enable subgroups;
+
+@group(0) @binding(0)
+var<storage> inputs : array<u32>;
+
+@group(0) @binding(1)
+var<storage, read_write> outputs : array<u32>;
+
+struct Metadata {
+  subgroup_size : array<u32, ${wgThreads}>,
+  subgroup_invocation_id : array<u32, ${wgThreads}>,
+}
+
+@group(0) @binding(2)
+var<storage, read_write> metadata : Metadata;
+
+@compute @workgroup_size(${t.params.wgSize[0]}, ${t.params.wgSize[1]}, ${t.params.wgSize[2]})
+fn main(
+  @builtin(local_invocation_index) lid : u32,
+  @builtin(subgroup_invocation_id) id : u32,
+  @builtin(subgroup_size) subgroupSize : u32,
+) {
+  metadata.subgroup_size[lid] = subgroupSize;
+
+  // Record subgroup invocation id for this invocation.
+  metadata.subgroup_invocation_id[lid] = id;
+
+  // Record a generated subgroup id.
+  outputs[${wgThreads} + lid] = subgroupBroadcastFirst(lid);
+
+  if ${testcase.cond} {
+    outputs[lid] = select(0u, 1u, subgroupAll(bool(inputs[lid])));
+  } else {
+    return;
+  }
+}`;
+
+    const includeCounter = false;
+    const inputData = generateInputData(t.params.case, wgThreads, includeCounter);
+
+    const uintsPerOutput = 2;
+    await runComputeTest(
+      t,
+      wgsl,
+      [t.params.wgSize[0], t.params.wgSize[1], t.params.wgSize[2]],
+      uintsPerOutput,
+      inputData,
+      (metadata: Uint32Array, output: Uint32Array) => {
+        return checkAll(metadata, output, wgThreads, inputData, testcase.filter);
+      }
+    );
+  });
+
+g.test('fragment').unimplemented();
diff --git a/src/webgpu/shader/execution/expression/call/builtin/subgroupAny.spec.ts b/src/webgpu/shader/execution/expression/call/builtin/subgroupAny.spec.ts
@@ -0,0 +1,265 @@
+export const description = `
+Execution tests for subgroupAny.
+
+Note: There is a lack of portability for non-uniform execution so these tests
+restrict themselves to uniform control flow.
+Note: There is no guaranteed mapping between subgroup_invocation_id and
+local_invocation_index. Tests should avoid assuming there is.
+`;
+
+import { makeTestGroup } from '../../../../../../common/framework/test_group.js';
+import { keysOf } from '../../../../../../common/util/data_tables.js';
+import { iterRange } from '../../../../../../common/util/util.js';
+import { PRNG } from '../../../../../util/prng.js';
+
+import {
+  kWGSizes,
+  kPredicateCases,
+  SubgroupTest,
+  runComputeTest,
+  kDataSentinel,
+} from './subgroup_util.js';
+
+export const g = makeTestGroup(SubgroupTest);
+
+const kNumCases = 15;
+
+/**
+ * Generate input data for testing.
+ *
+ * Data is generated in the following categories:
+ * Seed 0 generates all 0 data
+ * Seed 1 generates all 1 data
+ * Seeds 2-9 generates all 0s except for a one randomly once per 32 elements
+ * Seeds 10+ generate all random data
+ * @param seed The seed for the PRNG
+ * @param num The number of data items to generate
+ * @param addCounter If true, treats the first index as an atomic counter
+ */
+function generateInputData(seed: number, num: number, addCounter: boolean): Uint32Array {
+  const prng = new PRNG(seed);
+
+  const bound = Math.min(num, 32);
+  const index = prng.uniformInt(bound);
+
+  return new Uint32Array([
+    ...iterRange(num, x => {
+      if (addCounter && x === 0) {
+        // Counter should start at 1 to avoid clear value.
+        return 1;
+      }
+
+      if (seed === 0) {
+        return 0;
+      } else if (seed === 1) {
+        return 1;
+      } else if (seed < 10) {
+        const bounded = (addCounter ? x + 1 : x) % bound;
+        return bounded === index ? 1 : 0;
+      }
+      return prng.uniformInt(2);
+    }),
+  ]);
+}
+
+/**
+ * Checks the result of a subgroupAny operation
+ *
+ * Since subgroup size depends on the pipeline compile, we calculate the expected
+ * results after execution. The shader generates a subgroup id and records it for
+ * each invocation. The check first calculates the expected result for each subgroup
+ * and then compares to the actual result for each invocation. The filter functor
+ * ensures only the correct invocations contribute to the calculation.
+ * @param metadata An array of uints:
+ *                 * first half containing subgroup sizes (from builtin value)
+ *                 * second half subgroup invocation id
+ * @param output An array of uints containing:
+ *               * first half is the outputs of subgroupAny
+ *               * second half is a generated subgroup id
+ * @param numInvs Number of invocations executed
+ * @param input The input data (equal size to output)
+ * @param filter A functor to filter active invocations
+ */
+function checkAny(
+  metadata: Uint32Array, // unused
+  output: Uint32Array,
+  numInvs: number,
+  input: Uint32Array,
+  filter: (id: number, size: number) => boolean
+): Error | undefined {
+  // First, generate expected results.
+  const expected = new Map<number, number>();
+  for (let inv = 0; inv < numInvs; inv++) {
+    const size = metadata[inv];
+    const id = metadata[inv + numInvs];
+    if (!filter(id, size)) {
+      continue;
+    }
+    const subgroup_id = output[numInvs + inv];
+    let v = expected.get(subgroup_id) ?? 0;
+    v |= input[inv];
+    expected.set(subgroup_id, v);
+  }
+
+  // Second, check against actual results.
+  for (let inv = 0; inv < numInvs; inv++) {
+    const size = metadata[inv];
+    const id = metadata[inv + numInvs];
+    const res = output[inv];
+    if (filter(id, size)) {
+      const subgroup_id = output[numInvs + inv];
+      const expected_v = expected.get(subgroup_id) ?? 0;
+      if (expected_v !== res) {
+        return new Error(`Invocation ${inv}:
+- expected: ${expected_v}
+-      got: ${res}`);
+      }
+    } else {
+      if (res !== kDataSentinel) {
+        return new Error(`Invocation ${inv} unexpected write:
+- subgroup invocation id: ${id}
+-          subgroup size: ${size}`);
+      }
+    }
+  }
+
+  return undefined;
+}
+
+g.test('compute,all_active')
+  .desc(`Test compute subgroupAny`)
+  .params(u =>
+    u
+      .combine('wgSize', kWGSizes)
+      .beginSubcases()
+      .combine('case', [...iterRange(kNumCases, x => x)])
+  )
+  .beforeAllSubcases(t => {
+    t.selectDeviceOrSkipTestCase('subgroups' as GPUFeatureName);
+  })
+  .fn(async t => {
+    const wgThreads = t.params.wgSize[0] * t.params.wgSize[1] * t.params.wgSize[2];
+
+    const wgsl = `
+enable subgroups;
+
+@group(0) @binding(0)
+var<storage> inputs : array<u32>;
+
+@group(0) @binding(1)
+var<storage, read_write> outputs : array<u32>;
+
+struct Metadata {
+  subgroup_size: array<u32, ${wgThreads}>,
+  subgroup_invocation_id: array<u32, ${wgThreads}>,
+}
+
+@group(0) @binding(2)
+var<storage, read_write> metadata : Metadata;
+
+@compute @workgroup_size(${t.params.wgSize[0]}, ${t.params.wgSize[1]}, ${t.params.wgSize[2]})
+fn main(
+  @builtin(local_invocation_index) lid : u32,
+  @builtin(subgroup_invocation_id) id : u32,
+  @builtin(subgroup_size) subgroupSize : u32,
+) {
+  metadata.subgroup_size[lid] = subgroupSize;
+
+  metadata.subgroup_invocation_id[lid] = id;
+
+  // Record a representative subgroup id.
+  outputs[lid + ${wgThreads}] = subgroupBroadcastFirst(lid);
+
+  let res = select(0u, 1u, subgroupAny(bool(inputs[lid])));
+  outputs[lid] = res;
+}`;
+
+    const includeCounter = false;
+    const inputData = generateInputData(t.params.case, wgThreads, includeCounter);
+
+    const uintsPerOutput = 2;
+    await runComputeTest(
+      t,
+      wgsl,
+      [t.params.wgSize[0], t.params.wgSize[1], t.params.wgSize[2]],
+      uintsPerOutput,
+      inputData,
+      (metadata: Uint32Array, output: Uint32Array) => {
+        return checkAny(metadata, output, wgThreads, inputData, (id: number, size: number) => {
+          return true;
+        });
+      }
+    );
+  });
+
+g.test('compute,split')
+  .desc('Test that only active invocation participate')
+  .params(u =>
+    u
+      .combine('predicate', keysOf(kPredicateCases))
+      .beginSubcases()
+      .combine('wgSize', kWGSizes)
+      .combine('case', [...iterRange(kNumCases, x => x)])
+  )
+  .beforeAllSubcases(t => {
+    t.selectDeviceOrSkipTestCase('subgroups' as GPUFeatureName);
+  })
+  .fn(async t => {
+    const testcase = kPredicateCases[t.params.predicate];
+    const wgThreads = t.params.wgSize[0] * t.params.wgSize[1] * t.params.wgSize[2];
+
+    const wgsl = `
+enable subgroups;
+
+@group(0) @binding(0)
+var<storage> inputs : array<u32>;
+
+@group(0) @binding(1)
+var<storage, read_write> outputs : array<u32>;
+
+struct Metadata {
+  subgroup_size : array<u32, ${wgThreads}>,
+  subgroup_invocation_id : array<u32, ${wgThreads}>,
+}
+
+@group(0) @binding(2)
+var<storage, read_write> metadata : Metadata;
+
+@compute @workgroup_size(${t.params.wgSize[0]}, ${t.params.wgSize[1]}, ${t.params.wgSize[2]})
+fn main(
+  @builtin(local_invocation_index) lid : u32,
+  @builtin(subgroup_invocation_id) id : u32,
+  @builtin(subgroup_size) subgroupSize : u32,
+) {
+  metadata.subgroup_size[lid] = subgroupSize;
+
+  // Record subgroup invocation id for this invocation.
+  metadata.subgroup_invocation_id[lid] = id;
+
+  // Record a generated subgroup id.
+  outputs[${wgThreads} + lid] = subgroupBroadcastFirst(lid);
+
+  if ${testcase.cond} {
+    outputs[lid] = select(0u, 1u, subgroupAny(bool(inputs[lid])));
+  } else {
+    return;
+  }
+}`;
+
+    const includeCounter = false;
+    const inputData = generateInputData(t.params.case, wgThreads, includeCounter);
+
+    const uintsPerOutput = 2;
+    await runComputeTest(
+      t,
+      wgsl,
+      [t.params.wgSize[0], t.params.wgSize[1], t.params.wgSize[2]],
+      uintsPerOutput,
+      inputData,
+      (metadata: Uint32Array, output: Uint32Array) => {
+        return checkAny(metadata, output, wgThreads, inputData, testcase.filter);
+      }
+    );
+  });
+
+g.test('fragment').unimplemented();
diff --git a/src/webgpu/shader/execution/expression/call/builtin/subgroupBroadcast.spec.ts b/src/webgpu/shader/execution/expression/call/builtin/subgroupBroadcast.spec.ts
@@ -318,6 +318,4 @@ fn main(@builtin(subgroup_invocation_id) id : u32,
     t.expectGPUBufferValuesEqual(outputBuffer, new Uint32Array(expect));
   });
 
-g.test('dynamically_uniform_id').unimplemented();
-
 g.test('fragment').unimplemented();
diff --git a/src/webgpu/shader/execution/expression/call/builtin/subgroup_util.ts b/src/webgpu/shader/execution/expression/call/builtin/subgroup_util.ts
@@ -1,10 +1,12 @@
 import { assert, iterRange } from '../../../../../../common/util/util.js';
 import { Float16Array } from '../../../../../../external/petamoriken/float16/float16.js';
-import { GPUTest } from '../../../../../gpu_test.js';
+import { GPUTest, TextureTestMixin } from '../../../../../gpu_test.js';
 import { FPInterval } from '../../../../../util/floating_point.js';
 import { sparseScalarF16Range, sparseScalarF32Range } from '../../../../../util/math.js';
 import { PRNG } from '../../../../../util/prng.js';
 
+export class SubgroupTest extends TextureTestMixin(GPUTest) {}
+
 export const kNumCases = 1000;
 export const kStride = 128;
 
@@ -300,6 +302,8 @@ fn main(
   t.expectOK(checkAccuracy(metadata, output, [idx1, idx2], [val1, val2], identity, intervalGen));
 }
 
+export const kDataSentinel = 999;
+
 /**
  * Runs compute shader subgroup test
  *
@@ -346,14 +350,14 @@ export async function runComputeTest(
 
   const outputUints = outputUintsPerElement * wgThreads;
   const outputBuffer = t.makeBufferWithContents(
-    new Uint32Array([...iterRange(outputUints, x => 999)]),
+    new Uint32Array([...iterRange(outputUints, x => kDataSentinel)]),
     GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE
   );
   t.trackForCleanup(outputBuffer);
 
   const numMetadata = 2 * wgThreads;
   const metadataBuffer = t.makeBufferWithContents(
-    new Uint32Array([...iterRange(numMetadata, x => 999)]),
+    new Uint32Array([...iterRange(numMetadata, x => kDataSentinel)]),
     GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE
   );