From a12b71adfcf80f1d16e4a2e60a1e0eb44e14810f Mon Sep 17 00:00:00 2001
From: Gregg Tavares
Date: Sat, 15 Jun 2024 15:25:10 -0700
Subject: [PATCH] wip

---
 webgpu/lessons/webgpu-from-webgl.md           |  12 +-
 webgpu/lessons/webgpu-optimization.md         | 103 ++-
 ...imization-none-uniform-buffers-4kplus.html | 479 --------------
 ...-optimization-none-uniform-buffers-8k.html | 479 --------------
 ...bgl-optimization-none-uniform-buffers.html |  19 +-
 webgpu/webgl-optimization-none.html           |  19 +-
 webgpu/webgpu-optimization-none-4kplus.html   | 586 ------------------
 ...6-use-mapped-buffers-dyanmic-offsets.html} |  98 ++-
 8 files changed, 187 insertions(+), 1608 deletions(-)
 delete mode 100644 webgpu/webgl-optimization-none-uniform-buffers-4kplus.html
 delete mode 100644 webgpu/webgl-optimization-none-uniform-buffers-8k.html
 delete mode 100644 webgpu/webgpu-optimization-none-4kplus.html
 rename webgpu/{webgpu-optimization-step7-double-buffer-typedarray-set-count-100.html => webgpu-optimization-step6-use-mapped-buffers-dyanmic-offsets.html} (89%)

diff --git a/webgpu/lessons/webgpu-from-webgl.md b/webgpu/lessons/webgpu-from-webgl.md
index 55340f55..a9801fdb 100644
--- a/webgpu/lessons/webgpu-from-webgl.md
+++ b/webgpu/lessons/webgpu-from-webgl.md
@@ -1417,10 +1417,16 @@ way you organize data and optimize how you draw.
 
 See [this article on WebGPU optimization](webgpu-optimization.html) for ideas.
 
+Note: If you are comparing WebGL to WebGPU using [the article on optimization](webgpu-optimization.html),
+here are 2 WebGL samples you can use to compare:
+
+* [Drawing up to 20000 objects in WebGL using standard WebGL uniforms](../webgl-optimization-none.html)
+* [Drawing up to 20000 objects in WebGL using uniform blocks](../webgl-optimization-none-uniform-buffers.html)
+
+Also, if you're comparing the performance of WebGL vs WebGPU, see
+[this article](https://toji.dev/webgpu-best-practices/webgl-performance-comparison) to make sure you are comparing similar things.
+
 ---
 
 If you were already familiar with WebGL then I hope this article was useful.
 
-If you're comparing performance of WebGL vs WebGPU see
-[this article](https://toji.dev/webgpu-best-practices/webgl-performance-comparison) to make sure you are comparing similar things.
-
diff --git a/webgpu/lessons/webgpu-optimization.md b/webgpu/lessons/webgpu-optimization.md
index fcc192de..4c554731 100644
--- a/webgpu/lessons/webgpu-optimization.md
+++ b/webgpu/lessons/webgpu-optimization.md
@@ -2009,15 +2009,98 @@ which is almost 60% more than we started with.
 
 {{{example url="../webgpu-optimization-step6-use-mapped-buffers.html"}}}
 
-* double buffer?
-* Draw math with offset
-* Directly map the uniform buffer.
-* Use dynamic offsets
-* Texture Atlas or 2D-array
-* GPU Occlusion culling
-* GPU Scene graph matrix calculation
-* GPU Frustum culling
-* Indirect Drawing
-* Render Bundles
+Other things that *might* help:
+
+* **Double buffer the large uniform buffer**
+
+  This comes up as a possible optimization because, if a buffer is currently
+  in use by the GPU, WebGPU can't update it.
+
+  So, imagine you start rendering (you call `device.queue.submit`). The GPU starts
+  rendering using our large uniform buffer. You immediately try to update that buffer.
+  In this case, WebGPU would have to pause and wait for the GPU to finish using the
+  buffer for rendering.
+
+  This is unlikely to happen in our example above. We don't directly update the
+  uniform buffer. Instead, we update a transfer buffer and then, later, ask the GPU
+  to copy it to the uniform buffer.
+
+  This issue would be more likely to come up if we update a buffer directly on the
+  GPU using a compute shader. There's a rough sketch of double buffering below,
+  after this list.
+
+* **Compute matrix math with offsets**
+
+  The math library we created in [the series on matrix math](webgpu-matrix-math.html)
+  generates `Float32Array`s as outputs and takes `Float32Array`s as inputs.
+  It can modify a `Float32Array` in place. But what it can't do is update a
+  `Float32Array` at some offset.
+
+  This is why, in our loop where we update our per-object uniform values,
+  for each object we have to create 2 `Float32Array` views into our mapped
+  buffer. For 10000 objects that's creating 20000 of these temporary views.
+
+  Adding offsets to every input would make the functions burdensome to use, in my
+  opinion, but, just as a test, I wrote a modified version of the math functions
+  that take an offset. In other words,
+
+  ```js
+  mat4.multiply(a, b, dst);
+  ```
+
+  becomes
+
+  ```js
+  mat4.multiply(a, aOffset, b, bOffset, dst, dstOffset);
+  ```
+
+  [It appears to be about 7% faster to use the offsets](../webgpu-optimization-step6-use-mapped-buffers-math-w-offsets.html).
+
+  It's up to you if you feel that's worth it. For me personally, I'd prefer to keep it simple to use.
+  I'm rarely trying to draw 10000 things, but it's good to know that, if I wanted to squeeze out more
+  performance, this is one place I might find some. A sketch of what one of these
+  offset-taking functions might look like appears after this list.
+
+* **Directly map the uniform buffer**
+
+  In our example above we map a transfer buffer, a buffer that only has the `COPY_SRC` and `MAP_WRITE`
+  usage flags. We then have to call `encoder.copyBufferToBuffer` to copy its contents into the
+  actual uniform buffer (that pattern is condensed into a sketch after this list).
+
+  It would be much nicer if we could directly map the uniform buffer and avoid the copy.
+  Unfortunately, that ability is not available in WebGPU version 1, but it is being
+  considered as an optional feature sometime in the future.
+
+* **Indirect Drawing**
+
+  Indirect drawing refers to draw commands that take their input from a GPU buffer.
+
+  ```js
+  pass.draw(vertexCount, instanceCount, firstVertex, firstInstance);  // direct
+  pass.drawIndirect(someBuffer, offsetIntoSomeBuffer);                // indirect
+  ```
+
+  In the indirect case above, `someBuffer` is a 16-byte portion of a buffer that holds
+  `[vertexCount, instanceCount, firstVertex, firstInstance]`.
+
+  The advantage of indirect drawing is that you can have the GPU itself fill out the values.
+  You can even have the GPU set `vertexCount` and/or `instanceCount` to zero when you
+  don't want that thing to be drawn. There's a sketch of setting up an indirect draw
+  after this list.
+
+  Using indirect drawing you could, for example, pass every object's bounding box or
+  bounding sphere to the GPU and then have the GPU do frustum culling. If an object is
+  inside the frustum, the GPU would update that object's indirect drawing parameters so
+  it gets drawn; otherwise it would update them so it does not. "Frustum culling" is a
+  fancy way to say "check if the object is possibly inside the frustum of the camera".
+  We talked about frustums in
+  [the article on perspective projection](webgpu-persective-projection.html).
+
+* **Render Bundles**
+
+  Render bundles let you pre-record a bunch of command buffer commands and then
+  request that they be executed later. This can be useful, especially if your scene
+  is relatively static, meaning you don't need to add or remove objects later.
+  There's a small sketch after this list.
+
+  There's a great article [here](https://toji.dev/webgpu-best-practices/render-bundles)
+  that combines render bundles, indirect draws, and GPU frustum culling to show
+  some ideas for getting more speed in specialized situations.
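+
+Here's a rough sketch of the double-buffering idea. This is a condensed illustration,
+not a drop-in change; it assumes the same `uniformBufferSpace`, `maxObjects`, and
+per-object bind group setup as the examples above.
+
+```js
+// Create 2 large uniform buffers instead of 1.
+const uniformBuffers = [0, 1].map(() => device.createBuffer({
+  label: 'uniforms',
+  size: uniformBufferSpace * maxObjects,
+  usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
+}));
+
+let frameCount = 0;
+function render(time) {
+  ++frameCount;
+
+  // Alternate buffers each frame so we never copy into a buffer
+  // the GPU might still be reading from.
+  const uniformBuffer = uniformBuffers[frameCount % 2];
+
+  // ... fill a transfer buffer, copy it into `uniformBuffer` with
+  // copyBufferToBuffer, and draw with bind groups that reference
+  // this frame's `uniformBuffer` ...
+
+  requestAnimationFrame(render);
+}
+requestAnimationFrame(render);
+```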
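+
+To make the "math with offsets" idea concrete, here's a hypothetical sketch of one
+function written in that style. This is not the API of the math library used in these
+articles; it just shows how reading and writing at an offset avoids creating temporary
+`Float32Array` views.
+
+```js
+// Hypothetical: scale a 4x4 matrix stored at `mOffset` inside `m`, writing the
+// result at `dstOffset` inside `dst`. Uses the same element layout as the
+// examples (translation in elements 12..14). No subarray views are created.
+function mat4ScaleAtOffset(m, mOffset, v, dst, dstOffset) {
+  for (let col = 0; col < 4; ++col) {
+    const s = col < 3 ? v[col] : 1;   // scale the first 3 columns of 4, copy the last
+    for (let row = 0; row < 4; ++row) {
+      dst[dstOffset + col * 4 + row] = m[mOffset + col * 4 + row] * s;
+    }
+  }
+  return dst;
+}
+
+// usage: scale this object's world matrix in place, directly in the mapped buffer
+// mat4ScaleAtOffset(uniformValues, f32Offset + kWorldOffset, [scale, scale, scale],
+//                   uniformValues, f32Offset + kWorldOffset);
+```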
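+
+For reference, here's a condensed sketch of the copy that a "directly map the uniform
+buffer" feature would let us skip. The names (`uniformBufferSpace`, `maxObjects`,
+`uniformBuffer`) are the same ones used in the examples above.
+
+```js
+// Map a transfer buffer and write the uniform values into it ...
+const transferBuffer = device.createBuffer({
+  size: uniformBufferSpace * maxObjects,
+  usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.MAP_WRITE,
+  mappedAtCreation: true,
+});
+const uniformValues = new Float32Array(transferBuffer.getMappedRange());
+// ... fill uniformValues for every object ...
+transferBuffer.unmap();
+
+// ... then ask the GPU to copy it into the real uniform buffer.
+const encoder = device.createCommandEncoder();
+encoder.copyBufferToBuffer(transferBuffer, 0, uniformBuffer, 0, transferBuffer.size);
+device.queue.submit([encoder.finish()]);
+```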
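+
+Here's a minimal sketch of setting up an indirect draw. The `STORAGE` usage is only
+there so a compute shader could fill in or zero out the values; `numVertices` is
+assumed to be the vertex count from the examples above.
+
+```js
+// 4 uint32s per draw: vertexCount, instanceCount, firstVertex, firstInstance.
+// (For indexed geometry like the examples above there is also `drawIndexedIndirect`,
+// which reads 5 uint32s.)
+const indirectBuffer = device.createBuffer({
+  size: 4 * 4,
+  usage: GPUBufferUsage.INDIRECT |
+         GPUBufferUsage.COPY_DST |
+         GPUBufferUsage.STORAGE,
+});
+
+// Fill it from JavaScript here; a compute shader could update it instead,
+// for example setting instanceCount to 0 to skip drawing this object.
+device.queue.writeBuffer(
+    indirectBuffer, 0, new Uint32Array([numVertices, 1, 0, 0]));
+
+// later, inside the render pass
+pass.drawIndirect(indirectBuffer, 0);
+```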
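+
+And here's a minimal sketch of using a render bundle. It assumes the `pipeline`,
+`vertexBuffer`, `indicesBuffer`, `objectInfos`, and `presentationFormat` from the
+examples above, the `'depth24plus'` depth format they use, and that the set of
+objects being drawn doesn't change (otherwise the bundle must be re-recorded).
+
+```js
+// Record the draws once ...
+const bundleEncoder = device.createRenderBundleEncoder({
+  colorFormats: [presentationFormat],
+  depthStencilFormat: 'depth24plus',
+});
+bundleEncoder.setPipeline(pipeline);
+bundleEncoder.setVertexBuffer(0, vertexBuffer);
+bundleEncoder.setIndexBuffer(indicesBuffer, 'uint16');
+for (let i = 0; i < settings.numObjects; ++i) {
+  bundleEncoder.setBindGroup(0, objectInfos[i].bindGroup);
+  bundleEncoder.drawIndexed(numVertices);
+}
+const bundle = bundleEncoder.finish();
+
+// ... then, each frame, replay them inside a render pass whose color and
+// depth formats match the ones above, instead of re-encoding every draw.
+pass.executeBundles([bundle]);
+```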
diff --git a/webgpu/webgl-optimization-none-uniform-buffers-4kplus.html b/webgpu/webgl-optimization-none-uniform-buffers-4kplus.html
deleted file mode 100644
index 414d9940..00000000
--- a/webgpu/webgl-optimization-none-uniform-buffers-4kplus.html
+++ /dev/null
@@ -1,479 +0,0 @@
-
-
-  
-    
-    
-    WebGL Optimization - Uniform Blocks 4kplus
-    
-  
-  
-    
-    

-  
-  
-
diff --git a/webgpu/webgl-optimization-none-uniform-buffers-8k.html b/webgpu/webgl-optimization-none-uniform-buffers-8k.html
deleted file mode 100644
index 99102396..00000000
--- a/webgpu/webgl-optimization-none-uniform-buffers-8k.html
+++ /dev/null
@@ -1,479 +0,0 @@
-
-
-  
-    
-    
-    WebGL Optimization - Uniform Blocks 8k
-    
-  
-  
-    
-    

-  
-  
-
diff --git a/webgpu/webgl-optimization-none-uniform-buffers.html b/webgpu/webgl-optimization-none-uniform-buffers.html
index 928ce78b..48b40b67 100644
--- a/webgpu/webgl-optimization-none-uniform-buffers.html
+++ b/webgpu/webgl-optimization-none-uniform-buffers.html
@@ -358,19 +358,13 @@
 
     const startTimeMs = performance.now();
 
-    let width = 1;
-    let height = 1;
-    if (settings.render) {
-      const entry = canvasToSizeMap.get(canvas);
-      if (entry) {
-        width =  entry.contentBoxSize[0].inlineSize;
-        height = entry.contentBoxSize[0].blockSize;
-      }
-    }
+    const {width, height} = canvasToSizeMap.get(canvas) ?? canvas;
+    // Don't set the canvas size if it's already that size as it may be slow.
     if (canvas.width !== width || canvas.height !== height) {
       canvas.width = width;
       canvas.height = height;
     }
+
     gl.viewport(0, 0, canvas.width, canvas.height);
 
     // Get the current texture from the canvas context and
@@ -468,7 +462,12 @@
   requestAnimationFrame(render);
 
   const observer = new ResizeObserver(entries => {
-    entries.forEach(e => canvasToSizeMap.set(e.target, e));
+    entries.forEach(entry => {
+      canvasToSizeMap.set(entry.target, {
+        width: Math.max(1, entry.contentBoxSize[0].inlineSize),
+        height: Math.max(1, entry.contentBoxSize[0].blockSize),
+      });
+    });
   });
   observer.observe(canvas);
 }
diff --git a/webgpu/webgl-optimization-none.html b/webgpu/webgl-optimization-none.html
index f78a5d11..28ff4b10 100644
--- a/webgpu/webgl-optimization-none.html
+++ b/webgpu/webgl-optimization-none.html
@@ -355,19 +355,13 @@
 
     const startTimeMs = performance.now();
 
-    let width = 1;
-    let height = 1;
-    if (settings.render) {
-      const entry = canvasToSizeMap.get(canvas);
-      if (entry) {
-        width =  entry.contentBoxSize[0].inlineSize;
-        height = entry.contentBoxSize[0].blockSize;
-      }
-    }
+    const {width, height} = canvasToSizeMap.get(canvas) ?? canvas;
+    // Don't set the canvas size if it's already that size as it may be slow.
     if (canvas.width !== width || canvas.height !== height) {
       canvas.width = width;
       canvas.height = height;
     }
+
     gl.viewport(0, 0, canvas.width, canvas.height);
 
     // Get the current texture from the canvas context and
@@ -462,7 +456,12 @@
   requestAnimationFrame(render);
 
   const observer = new ResizeObserver(entries => {
-    entries.forEach(e => canvasToSizeMap.set(e.target, e));
+    entries.forEach(entry => {
+      canvasToSizeMap.set(entry.target, {
+        width: Math.max(1, entry.contentBoxSize[0].inlineSize),
+        height: Math.max(1, entry.contentBoxSize[0].blockSize),
+      });
+    });
   });
   observer.observe(canvas);
 }
diff --git a/webgpu/webgpu-optimization-none-4kplus.html b/webgpu/webgpu-optimization-none-4kplus.html
deleted file mode 100644
index bc656c81..00000000
--- a/webgpu/webgpu-optimization-none-4kplus.html
+++ /dev/null
@@ -1,586 +0,0 @@
-
-
-  
-    
-    
-    WebGPU Optimization - none 4kplus
-    
-  
-  
-    
-    

-  
-  
-
diff --git a/webgpu/webgpu-optimization-step7-double-buffer-typedarray-set-count-100.html b/webgpu/webgpu-optimization-step6-use-mapped-buffers-dyanmic-offsets.html
similarity index 89%
rename from webgpu/webgpu-optimization-step7-double-buffer-typedarray-set-count-100.html
rename to webgpu/webgpu-optimization-step6-use-mapped-buffers-dyanmic-offsets.html
index a09a6721..a81aa00b 100644
--- a/webgpu/webgpu-optimization-step7-double-buffer-typedarray-set-count-100.html
+++ b/webgpu/webgpu-optimization-step6-use-mapped-buffers-dyanmic-offsets.html
@@ -226,6 +226,46 @@
     `,
   });
 
+  const bindGroupLayout = device.createBindGroupLayout({
+    entries: [
+      {
+        binding: 0,
+        visibility: GPUShaderStage.FRAGMENT,
+        texture: {
+          sampleType: 'float',
+          viewDimension: '2d',
+          multisampled: false,
+        },
+      },
+      {
+        binding: 1,
+        visibility: GPUShaderStage.FRAGMENT,
+        sampler: {
+          type: 'filtering',
+        },
+      },
+      {
+        binding: 2,
+        visibility: GPUShaderStage.VERTEX,
+        buffer: { hasDynamicOffset: true },
+      },
+      {
+        binding: 3,
+        visibility: GPUShaderStage.VERTEX,
+        buffer: {},
+      },
+      {
+        binding: 4,
+        visibility: GPUShaderStage.FRAGMENT,
+        buffer: {},
+      },
+    ],
+  });
+
+  const pipelineLayout = device.createPipelineLayout({
+    bindGroupLayouts: [ bindGroupLayout ],
+  });
+
   function createBufferWithData(device, data, usage) {
     const buffer = device.createBuffer({
       size: data.byteLength,
@@ -273,7 +313,7 @@
 
   const pipeline = device.createRenderPipeline({
     label: 'textured model with point light w/specular highlight',
-    layout: 'auto',
+    layout: pipelineLayout,
     vertex: {
       module,
       buffers: [
@@ -374,11 +414,11 @@
 
   const uniformBufferSize = (12 + 16) * 4;
   const uniformBufferSpace = roundUp(uniformBufferSize, device.limits.minUniformBufferOffsetAlignment);
-  const uniformBuffers = [0, 1].map(() => device.createBuffer({
+  const uniformBuffer = device.createBuffer({
     label: 'uniforms',
     size: uniformBufferSpace * maxObjects,
     usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
-  }));
+  });
 
   const mappedTransferBuffers = [];
   const getMappedTransferBuffer = () => {
@@ -394,21 +434,19 @@
   const kWorldOffset = 12;
 
   for (let i = 0; i < maxObjects; ++i) {
-    const uniformBufferOffset = i * uniformBufferSpace;
-
     const material = randomArrayElement(materials);
 
-    const bindGroups = [0, 1].map(i => device.createBindGroup({
+    const bindGroup = device.createBindGroup({
       label: 'bind group for object',
-      layout: pipeline.getBindGroupLayout(0),
+      layout: bindGroupLayout,
       entries: [
         { binding: 0, resource: material.texture.createView() },
         { binding: 1, resource: material.sampler },
-        { binding: 2, resource: { buffer: uniformBuffers[i], offset: uniformBufferOffset, size: uniformBufferSize }},
+        { binding: 2, resource: { buffer: uniformBuffer, size: uniformBufferSize }},
         { binding: 3, resource: { buffer: globalUniformBuffer }},
         { binding: 4, resource: { buffer: material.materialUniformBuffer }},
       ],
-    }));
+    });
 
     const axis = vec3.normalize([rand(-1, 1), rand(-1, 1), rand(-1, 1)]);
     const radius = rand(10, 100);
@@ -417,7 +455,7 @@
     const scale = rand(2, 10);
 
     objectInfos.push({
-      bindGroups,
+      bindGroup,
 
       axis,
       radius,
@@ -449,7 +487,7 @@
   const degToRad = d => d * Math.PI / 180;
 
   const settings = {
-    numObjects: 100,
+    numObjects: 1000,
     render: true,
   };
 
@@ -459,16 +497,11 @@
 
   let depthTexture;
   let then = 0;
-  let frameCount = 0;
-
-  const worldTemp = mat4.identity();
-  const normalMatrixTemp = mat3.identity();
 
   function render(time) {
     time *= 0.001;  // convert to seconds
     const deltaTime = time - then;
     then = time;
-    ++frameCount;
 
     const startTimeMs = performance.now();
 
@@ -517,30 +550,29 @@
       } = objectInfos[i];
       const mathTimeStartMs = performance.now();
 
+      // Make views into the mapped buffer.
       const uniformBufferOffset = i * uniformBufferSpace;
       const f32Offset = uniformBufferOffset / 4;
+      const normalMatrixValue = uniformValues.subarray(
+          f32Offset + kNormalMatrixOffset, f32Offset + kNormalMatrixOffset + 12);
+      const worldValue = uniformValues.subarray(
+          f32Offset + kWorldOffset, f32Offset + kWorldOffset + 16);
 
       // Compute a world matrix
-      mat4.identity(worldTemp);
-      mat4.axisRotate(worldTemp, axis, i + time * speed, worldTemp);
-      mat4.translate(worldTemp, [0, 0, Math.sin(i * 3.721 + time * speed) * radius], worldTemp);
-      mat4.translate(worldTemp, [0, 0, Math.sin(i * 9.721 + time * 0.1) * radius], worldTemp);
-      mat4.rotateX(worldTemp, time * rotationSpeed + i, worldTemp);
-      mat4.scale(worldTemp, [scale, scale, scale], worldTemp);
+      mat4.identity(worldValue);
+      mat4.axisRotate(worldValue, axis, i + time * speed, worldValue);
+      mat4.translate(worldValue, [0, 0, Math.sin(i * 3.721 + time * speed) * radius], worldValue);
+      mat4.translate(worldValue, [0, 0, Math.sin(i * 9.721 + time * 0.1) * radius], worldValue);
+      mat4.rotateX(worldValue, time * rotationSpeed + i, worldValue);
+      mat4.scale(worldValue, [scale, scale, scale], worldValue);
 
       // Inverse and transpose it into the normalMatrix value
-      mat3.fromMat4(mat4.transpose(mat4.inverse(worldTemp)), normalMatrixTemp);
-
-      uniformValues.set(worldTemp, f32Offset + kWorldOffset);
-      uniformValues.set(normalMatrixTemp, f32Offset + kNormalMatrixOffset);
+      mat3.fromMat4(mat4.transpose(mat4.inverse(worldValue)), normalMatrixValue);
 
       mathElapsedTimeMs += performance.now() - mathTimeStartMs;
     }
     transferBuffer.unmap();
 
-    const resourceIndex = frameCount % 2;
-    const uniformBuffer = uniformBuffers[resourceIndex];
-
     // copy the uniform values from the transfer buffer to the uniform buffer
     if (settings.numObjects) {
       const size = (settings.numObjects - 1) * uniformBufferSpace + uniformBufferSize;
@@ -575,9 +607,13 @@
     pass.setVertexBuffer(0, vertexBuffer);
     pass.setIndexBuffer(indicesBuffer, 'uint16');
 
+    const offsets = new Uint32Array(1);
     for (let i = 0; i < settings.numObjects; ++i) {
-      const { bindGroups } = objectInfos[i];
-      pass.setBindGroup(0, bindGroups[resourceIndex]);
+      const uniformBufferOffset = i * uniformBufferSpace;
+      // Select this object's region of the uniform buffer with a dynamic offset.
+      offsets[0] = uniformBufferOffset;
+      const { bindGroup } = objectInfos[i];
+      pass.setBindGroup(0, bindGroup, offsets);
       pass.drawIndexed(numVertices);
     }