diff --git a/webgpu/lessons/webgpu-optimizating.md b/webgpu/lessons/webgpu-optimizating.md index 1711d2f7..a42b1100 100644 --- a/webgpu/lessons/webgpu-optimizating.md +++ b/webgpu/lessons/webgpu-optimizating.md @@ -35,9 +35,207 @@ we've done the following steps Let's make an example we can optimize -* Pack your vertices -* Use mappedOnCreation for initial data -* Split uniform buffer (shared, material, per model) +## Use mappedAtCreation for initial data + +In the example above, and in most of the examples on this site we've +used `writeBuffer` to copy data into a vertex or index buffer. As a very +minor optimization, for this particular case, when you create a buffer +you can pass in `mappedAtCreation: true`. This has 2 benefits. + +1. It's slightly faster to put the data into the new buffer (2) + +2. You don't have to add `GPUBufferUsage.COPY_DST` to the buffer's usage. + + This assumes you're not going to change the data later. + +```js + function createBufferWithData(device, data, usage) { + const buffer = device.createBuffer({ + size: data.byteLength, +- usage: usage | GPUBufferUsage.COPY_DST, ++ usage: usage, ++ mappedAtCreation: true, + }); +- device.queue.writeBuffer(buffer, 0, data); ++ const dst = new Uint8Array(buffer.getMappedRange()); ++ dst.set(new Uint8Array(data.buffer)); ++ buffer.unmap(); + return buffer; + } +``` + +Note that this optimization only helps at creation time so it will not +affect our performance at render time. + +## Pack and interleave your vertices + +In the example above we have 3 buffers, one for position, one for normals, +and one for texture coordinates. This is slower both on the CPU and GPU. +On the CPU in JavaScript we need to call `setVertexBuffer` once for each +buffer for each model we want to draw. On the GPU there are cache issues. 
+So, if we interleave the vertex data into a single buffer we'll only need +one call to `setVertexBuffer` and we'll help the GPU as well as all the +data needed for a single vertex will be located together in memory. + +```js +- const positions = new Float32Array([1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, -1]); +- const normals = new Float32Array([1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1]); +- const texcoords = new Float32Array([1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1]); ++ const vertexData = new Float32Array([ ++ // position normal texcoord ++ 1, 1, -1, 1, 0, 0, 1, 0, ++ 1, 1, 1, 1, 0, 0, 0, 0, ++ 1, -1, 1, 1, 0, 0, 0, 1, ++ 1, -1, -1, 1, 0, 0, 1, 1, ++ -1, 1, 1, -1, 0, 0, 1, 0, ++ -1, 1, -1, -1, 0, 0, 0, 0, ++ -1, -1, -1, -1, 0, 0, 0, 1, ++ -1, -1, 1, -1, 0, 0, 1, 1, ++ -1, 1, 1, 0, 1, 0, 1, 0, ++ 1, 1, 1, 0, 1, 0, 0, 0, ++ 1, 1, -1, 0, 1, 0, 0, 1, ++ -1, 1, -1, 0, 1, 0, 1, 1, ++ -1, -1, -1, 0, -1, 0, 1, 0, ++ 1, -1, -1, 0, -1, 0, 0, 0, ++ 1, -1, 1, 0, -1, 0, 0, 1, ++ -1, -1, 1, 0, -1, 0, 1, 1, ++ 1, 1, 1, 0, 0, 1, 1, 0, ++ -1, 1, 1, 0, 0, 1, 0, 0, ++ -1, -1, 1, 0, 0, 1, 0, 1, ++ 1, -1, 1, 0, 0, 1, 1, 1, ++ -1, 1, -1, 0, 0, -1, 1, 0, ++ 1, 1, -1, 0, 0, -1, 0, 0, ++ 1, -1, -1, 0, 0, -1, 0, 1, ++ -1, -1, -1, 0, 0, -1, 1, 1, ++ ]); + const indices = new Uint16Array([0, 1, 2, 0, 2, 3, 4, 5, 6, 4, 6, 7, 8, 9, 10, 8, 10, 11, 12, 13, 14, 12, 14, 15, 16, 17, 18, 16, 18, 19, 20, 21, 22, 20, 22, 23]); + +- const positionBuffer = createBufferWithData(device, positions, 
GPUBufferUsage.VERTEX); +- const normalBuffer = createBufferWithData(device, normals, GPUBufferUsage.VERTEX); +- const texcoordBuffer = createBufferWithData(device, texcoords, GPUBufferUsage.VERTEX); ++ const vertexBuffer = createBufferWithData(device, vertexData, GPUBufferUsage.VERTEX); + const indicesBuffer = createBufferWithData(device, indices, GPUBufferUsage.INDEX); + const numVertices = indices.length; + + const pipeline = device.createRenderPipeline({ + label: 'textured model with point light w/specular highlight', + layout: 'auto', + vertex: { + module, + buffers: [ +- // position +- { +- arrayStride: 3 * 4, // 3 floats +- attributes: [ +- {shaderLocation: 0, offset: 0, format: 'float32x3'}, +- ], +- }, +- // normal +- { +- arrayStride: 3 * 4, // 3 floats +- attributes: [ +- {shaderLocation: 1, offset: 0, format: 'float32x3'}, +- ], +- }, +- // uvs +- { +- arrayStride: 2 * 4, // 2 floats +- attributes: [ +- {shaderLocation: 2, offset: 0, format: 'float32x2'}, +- ], +- }, ++ { ++ arrayStride: (3 + 3 + 2) * 4, // 8 floats ++ attributes: [ ++ {shaderLocation: 0, offset: 0 * 4, format: 'float32x3'}, // position ++ {shaderLocation: 1, offset: 3 * 4, format: 'float32x3'}, // normal ++ {shaderLocation: 2, offset: 6 * 4, format: 'float32x2'}, // texcoord ++ ], ++ }, + ], + }, + fragment: { + module, + targets: [{ format: presentationFormat }], + }, + primitive: { + cullMode: 'back', + }, + depthStencil: { + depthWriteEnabled: true, + depthCompare: 'less', + format: 'depth24plus', + }, + }); + + ... +- pass.setVertexBuffer(0, positionBuffer); +- pass.setVertexBuffer(1, normalBuffer); +- pass.setVertexBuffer(2, texcoordBuffer); ++ pass.setVertexBuffer(0, vertexBuffer); +``` + +* Split uniform buffers (shared, material, per model) + +Our example right now has one uniform buffer object. 
+ +```wgsl +struct Uniforms { + normalMatrix: mat3x3f, + viewProjection: mat4x4f, + world: mat4x4f, + color: vec4f, + lightWorldPosition: vec3f, + viewWorldPosition: vec3f, + shininess: f32, +}; +``` + +Some of those uniform values like `viewProjection`, `lightWorldPosition` +and `viewWorldPosition` can be shared. + +We can split these into at least 2 uniform buffers. One for the shared +values and one for *per object values*. + +```wgsl +struct SharedUniforms { + viewProjection: mat4x4f, + lightWorldPosition: vec3f, + viewWorldPosition: vec3f, +}; +struct PerObjectUniforms { + normalMatrix: mat3x3f, + world: mat4x4f, + color: vec4f, + shininess: f32, +}; +``` + +With this change, we'll save having to copy the `viewProjection`, `lightWorldPosition` and `viewWorldPosition` to every uniform buffer. +We'll also copy less data with `device.queue.writeBuffer`. + +With that change our math portion dropped ~30%. + +A common organization in a 3D library is to have "models" (the vertex data), +"materials" (the colors, shininess, and texture), "lights" (which lights to use), +"viewInfo" (the view and projection matrix). In particular, in our example, +`color` and `shininess` never change so it's a waste to keep copying them +to the uniform buffer every frame. + +## Double buffer uniform buffers that are updated every frame + +WebGPU is required to make accessing a buffer safe. That means +when you submit a command buffer, WebGPU has to effectively check, "is this buffer +being updated? If so wait until the update is finished". Or, going the other way, +let's say you call `device.queue.writeBuffer`. WebGPU has to check "is this buffer currently being read by shaders? If so wait until that finishes". + +Double buffering in this case means, instead of one uniform buffer for +the "per object uniforms", the ones we're updating with the world and +normal matrices, we'd have two. We'd ping-pong which one we're updating. 
+This way, while WebGPU is drawing using one of those 2 buffers, we're updating +the other. So, WebGPU never has to wait. + + * Texture Atlas or 2D-array * GPU Occlusion culling diff --git a/webgpu/webgpu-optimization-all.html b/webgpu/webgpu-optimization-all.html new file mode 100644 index 00000000..44309670 --- /dev/null +++ b/webgpu/webgpu-optimization-all.html @@ -0,0 +1,588 @@ + + + 
+ + +