diff --git a/src/webgpu/shader/execution/memory_model/memory_model_setup.ts b/src/webgpu/shader/execution/memory_model/memory_model_setup.ts index f8e5b9034cb4..26183aff611f 100644 --- a/src/webgpu/shader/execution/memory_model/memory_model_setup.ts +++ b/src/webgpu/shader/execution/memory_model/memory_model_setup.ts @@ -1,5 +1,6 @@ import { GPUTest } from '../../../gpu_test'; import { checkElementsPassPredicate } from '../../../util/check_contents.js'; +import { align } from '../../../util/math.js'; /* All buffer sizes are counted in units of 4-byte words. */ @@ -76,12 +77,23 @@ const numReadOutputs = 2; type BufferWithSource = { /** Buffer used by shader code. */ deviceBuf: GPUBuffer; - /** Buffer populated from the host size, data is copied to device buffer for use by shader. */ + /** Buffer populated from the host side, data is copied to device buffer for use by shader. */ srcBuf: GPUBuffer; /** Size in bytes of the buffer. */ size: number; }; +type SubBufferWithSource = { + /** Buffer used by shader code. This buffer is shared for multiple used */ + deviceBuf: GPUBuffer; + /** Buffer populated from the host side, data is copied to device buffer for use by shader. */ + srcBuf: GPUBuffer; + /** Size in bytes of this portion of the buffer. */ + size: number; + /** Offset in bytes of this portion of the buffer */ + offset: number; +}; + /** Specifies the buffers used during a memory model test. */ type MemoryModelBuffers = { /** This is the memory region that testing threads read from and write to. */ @@ -128,11 +140,11 @@ const bytesPerWord = 4; * - enable directives, if necessary * - the type alias for AccessValueType */ -function shaderPreamble(accessValueType: AccessValueType): string { +function shaderPreamble(accessValueType: AccessValueType, constants: string): string { if (accessValueType === 'f16') { - return 'enable f16;\nalias AccessValueTy = f16;\n'; + return `enable f16;\nalias AccessValueTy = f16;\n${constants}\n`; } - return `alias AccessValueTy = ${accessValueType};\n`; + return `alias AccessValueTy = ${accessValueType};\n${constants}\n`; } /** @@ -191,19 +203,28 @@ export class MemoryModelTester { this.test = t; this.params = params; - testShader = shaderPreamble(accessValueType) + testShader; - resultShader = shaderPreamble(accessValueType) + resultShader; + const workgroupXSize = Math.min(params.workgroupSize, t.device.limits.maxComputeWorkgroupSizeX); + const constants = ` + const kNumBarriers = 1u; // MAINTENANCE_TODO: make barrier not an array + const kMaxWorkgroups = ${params.maxWorkgroups}u; + const kScratchMemorySize = ${params.scratchMemorySize}u; + const kWorkgroupXSize = ${workgroupXSize}u; + `; + testShader = shaderPreamble(accessValueType, constants) + testShader; + resultShader = shaderPreamble(accessValueType, constants) + resultShader; // set up buffers - const testingThreads = this.params.workgroupSize * this.params.testingWorkgroups; + const testingThreads = workgroupXSize * this.params.testingWorkgroups; const testLocationsSize = testingThreads * numMemLocations * this.params.memStride * bytesPerWord; const testLocationsBuffer: BufferWithSource = { deviceBuf: this.test.device.createBuffer({ + label: 'testLocationsBuffer', size: testLocationsSize, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE, }), srcBuf: this.test.device.createBuffer({ + label: 'testLocationsSrcBuf', size: testLocationsSize, usage: GPUBufferUsage.COPY_SRC, }), @@ -213,10 +234,12 @@ export class MemoryModelTester { const readResultsSize = testingThreads * numReadOutputs * bytesPerWord; const readResultsBuffer: BufferWithSource = { deviceBuf: this.test.device.createBuffer({ + label: 'readResultsBuffer', size: readResultsSize, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE, }), srcBuf: this.test.device.createBuffer({ + label: 'readResultsSrcBuf', size: readResultsSize, usage: GPUBufferUsage.COPY_SRC, }), @@ -226,10 +249,12 @@ export class MemoryModelTester { const testResultsSize = this.params.numBehaviors * bytesPerWord; const testResultsBuffer: BufferWithSource = { deviceBuf: this.test.device.createBuffer({ + label: 'testResultsBuffer', size: testResultsSize, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC, }), srcBuf: this.test.device.createBuffer({ + label: 'testResultsSrcBuffer', size: testResultsSize, usage: GPUBufferUsage.COPY_SRC, }), @@ -249,52 +274,67 @@ export class MemoryModelTester { size: shuffledWorkgroupsSize, }; - const barrierSize = bytesPerWord; - const barrierBuffer: BufferWithSource = { - deviceBuf: this.test.device.createBuffer({ - size: barrierSize, - usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE, - }), + // Combine 3 arrays into 1 buffer as we need to keep the number of storage buffers to 4 for compat. + const falseSharingAvoidanceQuantum = 4096; + const barrierSize = align(bytesPerWord, falseSharingAvoidanceQuantum); + const scratchpadSize = align( + this.params.scratchMemorySize * bytesPerWord, + falseSharingAvoidanceQuantum + ); + const scratchMemoryLocationsSize = align( + this.params.maxWorkgroups * bytesPerWord, + falseSharingAvoidanceQuantum + ); + const comboSize = barrierSize + scratchpadSize + scratchMemoryLocationsSize; + + const comboBuffer = this.test.device.createBuffer({ + label: 'comboBuffer', + size: comboSize, + usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE, + }); + + const barrierBuffer: SubBufferWithSource = { + deviceBuf: comboBuffer, srcBuf: this.test.device.createBuffer({ + label: 'barrierSrcBuf', size: barrierSize, usage: GPUBufferUsage.COPY_SRC, }), size: barrierSize, + offset: 0, }; - const scratchpadSize = this.params.scratchMemorySize * bytesPerWord; - const scratchpadBuffer: BufferWithSource = { - deviceBuf: this.test.device.createBuffer({ - size: scratchpadSize, - usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE, - }), + const scratchpadBuffer: SubBufferWithSource = { + deviceBuf: comboBuffer, srcBuf: this.test.device.createBuffer({ + label: 'scratchpadSrcBuf', size: scratchpadSize, usage: GPUBufferUsage.COPY_SRC, }), size: scratchpadSize, + offset: barrierSize, }; - const scratchMemoryLocationsSize = this.params.maxWorkgroups * bytesPerWord; - const scratchMemoryLocationsBuffer: BufferWithSource = { - deviceBuf: this.test.device.createBuffer({ - size: scratchMemoryLocationsSize, - usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE, - }), + const scratchMemoryLocationsBuffer: SubBufferWithSource = { + deviceBuf: comboBuffer, srcBuf: this.test.device.createBuffer({ + label: 'scratchMemoryLocationsSrcBuf', size: scratchMemoryLocationsSize, usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.MAP_WRITE, }), size: scratchMemoryLocationsSize, + offset: barrierSize + scratchpadSize, }; const stressParamsSize = numStressParams * bytesPerWord; const stressParamsBuffer: BufferWithSource = { deviceBuf: this.test.device.createBuffer({ + label: 'stressParamsBuffer', size: stressParamsSize, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.UNIFORM, }), srcBuf: this.test.device.createBuffer({ + label: 'stressParamsSrcBuf', size: stressParamsSize, usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.MAP_WRITE, }), @@ -314,17 +354,17 @@ export class MemoryModelTester { // set up pipeline layouts const testLayout = this.test.device.createBindGroupLayout({ + label: 'testLayout', entries: [ { binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' } }, { binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' } }, { binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'read-only-storage' } }, { binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' } }, - { binding: 4, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' } }, - { binding: 5, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' } }, - { binding: 6, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'uniform' } }, + { binding: 4, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'uniform' } }, ], }); this.testPipeline = this.test.device.createComputePipeline({ + label: 'testPipeline', layout: this.test.device.createPipelineLayout({ bindGroupLayouts: [testLayout], }), @@ -336,19 +376,19 @@ export class MemoryModelTester { }, }); this.testBindGroup = this.test.device.createBindGroup({ + label: 'testBindGroup', entries: [ { binding: 0, resource: { buffer: this.buffers.testLocations.deviceBuf } }, { binding: 1, resource: { buffer: this.buffers.readResults.deviceBuf } }, { binding: 2, resource: { buffer: this.buffers.shuffledWorkgroups.deviceBuf } }, - { binding: 3, resource: { buffer: this.buffers.barrier.deviceBuf } }, - { binding: 4, resource: { buffer: this.buffers.scratchpad.deviceBuf } }, - { binding: 5, resource: { buffer: this.buffers.scratchMemoryLocations.deviceBuf } }, - { binding: 6, resource: { buffer: this.buffers.stressParams.deviceBuf } }, + { binding: 3, resource: { buffer: comboBuffer } }, + { binding: 4, resource: { buffer: this.buffers.stressParams.deviceBuf } }, ], layout: testLayout, }); const resultLayout = this.test.device.createBindGroupLayout({ + label: 'resultLayout', entries: [ { binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' } }, { binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: 'storage' } }, @@ -357,6 +397,7 @@ export class MemoryModelTester { ], }); this.resultPipeline = this.test.device.createComputePipeline({ + label: 'resultPipeline', layout: this.test.device.createPipelineLayout({ bindGroupLayouts: [resultLayout], }), @@ -368,6 +409,7 @@ export class MemoryModelTester { }, }); this.resultBindGroup = this.test.device.createBindGroup({ + label: 'resultBindGroup', entries: [ { binding: 0, resource: { buffer: this.buffers.testLocations.deviceBuf } }, { binding: 1, resource: { buffer: this.buffers.readResults.deviceBuf } }, @@ -464,8 +506,17 @@ export class MemoryModelTester { } /** Utility method that simplifies copying source buffers to device buffers. */ - protected copyBufferToBuffer(encoder: GPUCommandEncoder, buffer: BufferWithSource): void { - encoder.copyBufferToBuffer(buffer.srcBuf, 0, buffer.deviceBuf, 0, buffer.size); + protected copyBufferToBuffer( + encoder: GPUCommandEncoder, + buffer: BufferWithSource | SubBufferWithSource + ): void { + encoder.copyBufferToBuffer( + buffer.srcBuf, + 0, + buffer.deviceBuf, + (buffer as SubBufferWithSource).offset || 0, + buffer.size + ); } /** Returns a random integer between 0 and the max. */ @@ -626,7 +677,19 @@ const shaderMemStructures = ` }; struct IndexMemory { - value: array + value: array, + }; + + struct AtomicMemoryBarrier { + value: array, kNumBarriers> + }; + + struct IndexMemoryScratchpad { + value: array, + }; + + struct IndexMemoryScratchLocations { + value: array, }; struct ReadResult { @@ -635,7 +698,14 @@ const shaderMemStructures = ` }; struct ReadResults { - value: array + value: array, + }; + + // These arrays are combine into 1 buffer because compat mode only supports 4 storage buffers by default. + struct CombinedData { + barrier: AtomicMemoryBarrier, + scratchpad: IndexMemoryScratchpad, + scratch_locations: IndexMemoryScratchLocations, }; struct StressParamsMemory { @@ -687,10 +757,8 @@ const twoBehaviorTestResultStructure = ` const commonTestShaderBindings = ` @group(0) @binding(1) var results : ReadResults; @group(0) @binding(2) var shuffled_workgroups : IndexMemory; - @group(0) @binding(3) var barrier : AtomicMemory; - @group(0) @binding(4) var scratchpad : IndexMemory; - @group(0) @binding(5) var scratch_locations : IndexMemory; - @group(0) @binding(6) var stress_params : StressParamsMemory; + @group(0) @binding(3) var combo : CombinedData; + @group(0) @binding(4) var stress_params : StressParamsMemory; `; /** The combined bindings for a test on atomic memory. */ @@ -758,12 +826,12 @@ const testShaderFunctions = ` // the barrier but does not overly reduce testing throughput. fn spin(limit: u32) { var i : u32 = 0u; - var bar_val : u32 = atomicAdd(&barrier.value[0], 1u); + var bar_val : u32 = atomicAdd(&combo.barrier.value[0], 1u); loop { if (i == 1024u || bar_val >= limit) { break; } - bar_val = atomicAdd(&barrier.value[0], 0u); + bar_val = atomicAdd(&combo.barrier.value[0], 0u); i = i + 1u; } } @@ -773,44 +841,44 @@ const testShaderFunctions = ` // the compiler optimizing out unused loads, where 100,000 is larger than the maximum number of stress iterations used // in any test. fn do_stress(iterations: u32, pattern: u32, workgroup_id: u32) { - let addr = scratch_locations.value[workgroup_id]; + let addr = combo.scratch_locations.value[workgroup_id]; switch(pattern) { case 0u: { for(var i: u32 = 0u; i < iterations; i = i + 1u) { - scratchpad.value[addr] = i; - scratchpad.value[addr] = i + 1u; + combo.scratchpad.value[addr] = i; + combo.scratchpad.value[addr] = i + 1u; } } case 1u: { for(var i: u32 = 0u; i < iterations; i = i + 1u) { - scratchpad.value[addr] = i; - let tmp1: u32 = scratchpad.value[addr]; + combo.scratchpad.value[addr] = i; + let tmp1: u32 = combo.scratchpad.value[addr]; if (tmp1 > 100000u) { - scratchpad.value[addr] = i; + combo.scratchpad.value[addr] = i; break; } } } case 2u: { for(var i: u32 = 0u; i < iterations; i = i + 1u) { - let tmp1: u32 = scratchpad.value[addr]; + let tmp1: u32 = combo.scratchpad.value[addr]; if (tmp1 > 100000u) { - scratchpad.value[addr] = i; + combo.scratchpad.value[addr] = i; break; } - scratchpad.value[addr] = i; + combo.scratchpad.value[addr] = i; } } case 3u: { for(var i: u32 = 0u; i < iterations; i = i + 1u) { - let tmp1: u32 = scratchpad.value[addr]; + let tmp1: u32 = combo.scratchpad.value[addr]; if (tmp1 > 100000u) { - scratchpad.value[addr] = i; + combo.scratchpad.value[addr] = i; break; } - let tmp2: u32 = scratchpad.value[addr]; + let tmp2: u32 = combo.scratchpad.value[addr]; if (tmp2 > 100000u) { - scratchpad.value[addr] = i; + combo.scratchpad.value[addr] = i; break; } } @@ -827,7 +895,7 @@ const testShaderFunctions = ` */ const shaderEntryPoint = ` // Change to pipeline overridable constant when possible. - const workgroupXSize = 256u; + const workgroupXSize = kWorkgroupXSize; @compute @workgroup_size(workgroupXSize) fn main( @builtin(local_invocation_id) local_invocation_id : vec3, @builtin(workgroup_id) workgroup_id : vec3) {