From 6fa114c020f8656947e283bdf4bed96fe7a54c70 Mon Sep 17 00:00:00 2001 From: Daniel McNab <36049421+DJMcNab@users.noreply.github.com> Date: Thu, 22 Feb 2024 11:06:02 +0000 Subject: [PATCH] Allow initialising shaders in parallel (#455) * Initialise shaders in parallel * Add control and visibility over initialisation * Clarify comment * Fix wasm compilation * Fix inverted name * Allow configuring the number of threads directly * Fix dodgy maths --- crates/tests/src/lib.rs | 3 +- examples/headless/src/main.rs | 2 + examples/scenes/src/svg.rs | 3 +- examples/with_bevy/src/main.rs | 4 + examples/with_winit/src/lib.rs | 17 ++++ src/lib.rs | 22 +++++ src/wgpu_engine.rs | 165 ++++++++++++++++++++++++++++----- 7 files changed, 190 insertions(+), 26 deletions(-) diff --git a/crates/tests/src/lib.rs b/crates/tests/src/lib.rs index faf7b5149..3cceefcc6 100644 --- a/crates/tests/src/lib.rs +++ b/crates/tests/src/lib.rs @@ -1,4 +1,4 @@ -use std::{env, fs::File, path::Path, sync::Arc}; +use std::{env, fs::File, num::NonZeroUsize, path::Path, sync::Arc}; use anyhow::{anyhow, bail, Result}; use vello::{ @@ -62,6 +62,7 @@ pub async fn render(scene: Scene, params: &TestParams) -> Result { RendererOptions { surface_format: None, use_cpu: params.use_cpu, + num_init_threads: NonZeroUsize::new(1), antialiasing_support: vello::AaSupport::area_only(), }, ) diff --git a/examples/headless/src/main.rs b/examples/headless/src/main.rs index 65587c4df..add53ba83 100644 --- a/examples/headless/src/main.rs +++ b/examples/headless/src/main.rs @@ -1,5 +1,6 @@ use std::{ fs::File, + num::NonZeroUsize, path::{Path, PathBuf}, }; @@ -90,6 +91,7 @@ async fn render(mut scenes: SceneSet, index: usize, args: &Args) -> Result<()> { RendererOptions { surface_format: None, use_cpu: args.use_cpu, + num_init_threads: NonZeroUsize::new(1), antialiasing_support: vello::AaSupport::area_only(), }, ) diff --git a/examples/scenes/src/svg.rs b/examples/scenes/src/svg.rs index b4ca6c7a6..3155bc9d8 100644 --- 
a/examples/scenes/src/svg.rs +++ b/examples/scenes/src/svg.rs @@ -76,7 +76,8 @@ fn example_scene_of(file: PathBuf) -> ExampleScene { .unwrap_or_else(|| "unknown".to_string()); ExampleScene { function: Box::new(svg_function_of(name.clone(), move || { - std::fs::read_to_string(file).expect("failed to read svg file") + std::fs::read_to_string(&file) + .unwrap_or_else(|e| panic!("failed to read svg file {file:?}: {e}")) })), config: crate::SceneConfig { animated: false, diff --git a/examples/with_bevy/src/main.rs b/examples/with_bevy/src/main.rs index b4760e9dd..b5c310e46 100644 --- a/examples/with_bevy/src/main.rs +++ b/examples/with_bevy/src/main.rs @@ -1,3 +1,5 @@ +use std::num::NonZeroUsize; + use bevy::render::{Render, RenderSet}; use bevy::utils::synccell::SyncCell; use vello::kurbo::{Affine, Point, Rect, Stroke}; @@ -29,6 +31,8 @@ impl FromWorld for VelloRenderer { device.wgpu_device(), RendererOptions { surface_format: None, + // TODO: We should ideally use the Bevy threadpool here + num_init_threads: NonZeroUsize::new(1), antialiasing_support: vello::AaSupport::area_only(), use_cpu: false, }, diff --git a/examples/with_winit/src/lib.rs b/examples/with_winit/src/lib.rs index 00868de7e..2fc4dd731 100644 --- a/examples/with_winit/src/lib.rs +++ b/examples/with_winit/src/lib.rs @@ -15,6 +15,7 @@ // Also licensed under MIT license, at your choice. 
use instant::{Duration, Instant}; +use std::num::NonZeroUsize; use std::{collections::HashSet, sync::Arc}; use anyhow::Result; @@ -51,6 +52,21 @@ struct Args { #[arg(long)] /// Whether to use CPU shaders use_cpu: bool, + /// How many threads to use for initialising the shaders (`1` forces serial initialisation rather than spawning threads). + /// This has no effect on wasm, and defaults to 1 on macOS for performance reasons + /// + /// Use `0` for an automatic choice + #[arg(long, default_value_t=default_threads())] + num_init_threads: usize, +} + +fn default_threads() -> usize { + #![allow(unreachable_code)] + #[cfg(target_os = "macos")] + { + return 1; + } + 0 } struct RenderState<'s> { @@ -538,6 +554,7 @@ fn run( surface_format: Some(render_state.surface.format), use_cpu, antialiasing_support: vello::AaSupport::all(), + num_init_threads: NonZeroUsize::new(args.num_init_threads) }, ) .expect("Could create renderer") diff --git a/src/lib.rs b/src/lib.rs index 4cde7fb52..e9676cd32 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -25,6 +25,8 @@ mod shaders; #[cfg(feature = "wgpu")] mod wgpu_engine; +use std::{num::NonZeroUsize, time::Instant}; + /// Styling and composition primitives. pub use peniko; /// 2D geometry, with a focus on curves. @@ -140,6 +142,16 @@ pub struct RendererOptions { /// Represents the enabled set of AA configurations. This will be used to determine which /// pipeline permutations should be compiled at startup. pub antialiasing_support: AaSupport, + + /// How many threads to use for initialisation of shaders. + /// + /// Use `Some(1)` to use a single thread. This is recommended when on macOS + /// (see https://github.com/bevyengine/bevy/pull/10812#discussion_r1496138004) + /// + /// Set to `None` to use a heuristic which will use many but not all threads + /// + /// Has no effect on WebAssembly + pub num_init_threads: Option<NonZeroUsize>, } #[cfg(feature = "wgpu")] @@ -147,7 +159,16 @@ impl Renderer { /// Creates a new renderer for the specified device.
pub fn new(device: &Device, options: RendererOptions) -> Result { let mut engine = WgpuEngine::new(options.use_cpu); + // If we are running in parallel (i.e. the number of threads is not 1) + if options.num_init_threads != NonZeroUsize::new(1) { + #[cfg(not(target_arch = "wasm32"))] + engine.use_parallel_initialisation(); + } + let start = Instant::now(); let shaders = shaders::full_shaders(device, &mut engine, &options)?; + #[cfg(not(target_arch = "wasm32"))] + engine.build_shaders_if_needed(device, options.num_init_threads); + eprintln!("Building shaders took {:?}", start.elapsed()); let blit = options .surface_format .map(|surface_format| BlitPipeline::new(device, surface_format)); @@ -272,6 +293,7 @@ impl Renderer { pub async fn reload_shaders(&mut self, device: &Device) -> Result<()> { device.push_error_scope(wgpu::ErrorFilter::Validation); let mut engine = WgpuEngine::new(self.options.use_cpu); + // We choose not to initialise these shaders in parallel, to ensure the error scope works correctly let shaders = shaders::full_shaders(device, &mut engine, &self.options)?; let error = device.pop_error_scope().await; if let Some(error) = error { diff --git a/src/wgpu_engine.rs b/src/wgpu_engine.rs index 8a39a76e7..bb2df60bf 100644 --- a/src/wgpu_engine.rs +++ b/src/wgpu_engine.rs @@ -19,12 +19,22 @@ use crate::{ BufProxy, Command, Id, ImageProxy, Recording, ResourceProxy, ShaderId, }; +#[cfg(not(target_arch = "wasm32"))] +struct UninitialisedShader { + wgsl: Cow<'static, str>, + label: &'static str, + entries: Vec<wgpu::BindGroupLayoutEntry>, + shader_id: ShaderId, +} + #[derive(Default)] pub struct WgpuEngine { shaders: Vec<Shader>, pool: ResourcePool, bind_map: BindMap, downloads: HashMap, + #[cfg(not(target_arch = "wasm32"))] + shaders_to_initialise: Option<Vec<UninitialisedShader>>, pub(crate) use_cpu: bool, } @@ -62,7 +72,7 @@ impl Shader { } else if let Some(wgpu) = self.wgpu.as_ref() { ShaderKind::Wgpu(wgpu) } else { - panic!("no available shader") + panic!("no available shader for {}", self.label) } } } @@ -130,6
+140,88 @@ impl WgpuEngine { } } + /// Enable creating any remaining shaders in parallel + #[cfg(not(target_arch = "wasm32"))] + pub fn use_parallel_initialisation(&mut self) { + if self.shaders_to_initialise.is_some() { + return; + } + self.shaders_to_initialise = Some(Vec::new()); + } + + #[cfg(not(target_arch = "wasm32"))] + /// Initialise (in parallel) any shaders which are yet to be created + pub fn build_shaders_if_needed( + &mut self, + device: &Device, + num_threads: Option<std::num::NonZeroUsize>, + ) { + use std::num::NonZeroUsize; + + if let Some(mut new_shaders) = self.shaders_to_initialise.take() { + let num_threads = num_threads + .map(NonZeroUsize::get) + .unwrap_or_else(|| { + // Fallback onto a heuristic. This tries not to use all threads. + // We keep the main thread blocked and not doing much whilst this is running, + // so we broadly leave two cores unused at the point of maximum parallelism + // (This choice is arbitrary, and could be tuned, although a 'proper' threadpool + // should probably be used instead) + std::thread::available_parallelism().map_or(2, |it| it.get().max(4) - 2) + }) + .min(new_shaders.len()); + eprintln!("Initialising in parallel using {num_threads} threads"); + let remainder = new_shaders.split_off(num_threads); + let (tx, rx) = std::sync::mpsc::channel::<(ShaderId, WgpuShader)>(); + + // We expect each initialisation to take much longer than acquiring a lock, so we just use a mutex for our work queue + let work_queue = std::sync::Mutex::new(remainder.into_iter()); + let work_queue = &work_queue; + std::thread::scope(|scope| { + let tx = tx; + new_shaders + .into_iter() + .map(|it| { + let tx = tx.clone(); + std::thread::Builder::new() + .name("Vello shader initialisation worker thread".into()) + .spawn_scoped(scope, move || { + let shader = Self::create_compute_pipeline( + device, it.label, it.wgsl, it.entries, + ); + // We know the rx can only be closed if all the tx references are dropped + tx.send((it.shader_id, shader)).unwrap(); + while
let Ok(mut guard) = work_queue.lock() { + if let Some(value) = guard.next() { + drop(guard); + let shader = Self::create_compute_pipeline( + device, + value.label, + value.wgsl, + value.entries, + ); + tx.send((value.shader_id, shader)).unwrap(); + } else { + break; + } + } + // Another thread panicked or we finished. + // If another thread panicked, we ignore that here and finish our processing + drop(tx); + }) + .expect("failed to spawn thread"); + }) + .for_each(drop); + // Drop the initial sender, to mean that there will be no more senders if and only if all other threads have finished + drop(tx); + + while let Ok((id, value)) = rx.recv() { + self.shaders[id.0].wgpu = Some(value); + } + }); + } + } + /// Add a shader. /// /// This function is somewhat limited, it doesn't apply a label, only allows one bind group, @@ -173,10 +265,6 @@ impl WgpuEngine { } } - let shader_module = device.create_shader_module(wgpu::ShaderModuleDescriptor { - label: Some(label), - source: wgpu::ShaderSource::Wgsl(wgsl), - }); let entries = layout .iter() .enumerate() @@ -225,27 +313,24 @@ impl WgpuEngine { } }) .collect::<Vec<_>>(); - let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { - label: None, - entries: &entries, - }); - let compute_pipeline_layout = - device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { - label: None, - bind_group_layouts: &[&bind_group_layout], - push_constant_ranges: &[], + #[cfg(not(target_arch = "wasm32"))] + if let Some(uninit) = self.shaders_to_initialise.as_mut() { + let id = add(Shader { + label, + wgpu: None, + cpu: None, + })?; + uninit.push(UninitialisedShader { + wgsl, + label, + entries, + shader_id: id, }); - let pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor { - label: Some(label), - layout: Some(&compute_pipeline_layout), - module: &shader_module, - entry_point: "main", - }); + return Ok(id); + } + let wgpu = Self::create_compute_pipeline(device, label, wgsl, entries);
add(Shader { - wgpu: Some(WgpuShader { - pipeline, - bind_group_layout, - }), + wgpu: Some(wgpu), cpu: None, label, }) @@ -532,6 +617,38 @@ impl WgpuEngine { pub fn free_download(&mut self, buf: BufProxy) { self.downloads.remove(&buf.id); } + + fn create_compute_pipeline( + device: &Device, + label: &str, + wgsl: Cow<'_, str>, + entries: Vec<wgpu::BindGroupLayoutEntry>, + ) -> WgpuShader { + let shader_module = device.create_shader_module(wgpu::ShaderModuleDescriptor { + label: Some(label), + source: wgpu::ShaderSource::Wgsl(wgsl), + }); + let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { + label: None, + entries: &entries, + }); + let compute_pipeline_layout = + device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { + label: None, + bind_group_layouts: &[&bind_group_layout], + push_constant_ranges: &[], + }); + let pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor { + label: Some(label), + layout: Some(&compute_pipeline_layout), + module: &shader_module, + entry_point: "main", + }); + WgpuShader { + pipeline, + bind_group_layout, + } + } } impl BindMap {