From 7d1e0125116ab62535413723f09769fc821faa9f Mon Sep 17 00:00:00 2001 From: Connor Fitzgerald Date: Mon, 27 Nov 2023 02:24:23 -0500 Subject: [PATCH] Gpu culling (#489) Co-authored-by: Connor Fitzgerald --- .github/workflows/ci.yml | 2 + Cargo.toml | 2 +- deny.toml | 2 - examples/animation/src/lib.rs | 10 +- examples/cube-no-framework/src/main.rs | 13 +- examples/cube/src/lib.rs | 8 +- examples/egui/src/main.rs | 8 +- examples/scene-viewer/src/lib.rs | 48 +- examples/skinning/src/lib.rs | 8 +- examples/static-gltf/src/main.rs | 8 +- examples/textured-quad/src/main.rs | 8 +- rend3-framework/src/lib.rs | 1 + rend3-routine/Cargo.toml | 8 +- rend3-routine/shaders/src/cull.wgsl | 396 +++++++++-- rend3-routine/shaders/src/depth.wgsl | 28 +- rend3-routine/shaders/src/hi_z.wgsl | 33 + rend3-routine/shaders/src/opaque.wgsl | 28 +- .../shaders/src/resolve_depth_min.wgsl | 27 + rend3-routine/shaders/src/structures.wgsl | 36 +- .../shaders/src/structures_object.wgsl | 13 +- rend3-routine/shaders/src/uniform_prep.wgsl | 27 + rend3-routine/src/base.rs | 224 ++++-- rend3-routine/src/clear.rs | 28 +- rend3-routine/src/common/camera.rs | 36 + rend3-routine/src/common/interfaces.rs | 9 + rend3-routine/src/common/mod.rs | 2 + rend3-routine/src/culling/batching.rs | 116 ++- rend3-routine/src/culling/culler.rs | 665 ++++++++++++++---- rend3-routine/src/culling/mod.rs | 6 +- rend3-routine/src/culling/suballoc.rs | 227 ++++++ rend3-routine/src/forward.rs | 128 +++- rend3-routine/src/hi_z.rs | 276 ++++++++ rend3-routine/src/lib.rs | 1 + rend3-routine/src/pbr/routine.rs | 14 +- rend3-routine/src/shaders.rs | 34 +- rend3-routine/src/skinning.rs | 4 +- rend3-test/Cargo.toml | 1 + rend3-test/src/helpers.rs | 13 +- rend3-test/src/lib.rs | 2 + rend3-test/src/runner.rs | 29 +- rend3-test/src/threshold.rs | 73 ++ rend3-test/tests/msaa.rs | 6 +- rend3-test/tests/object.rs | 61 ++ .../results/object/multi-frame-add-0.png | Bin 0 -> 439 bytes .../results/object/multi-frame-add-1.png | Bin 0 -> 435 bytes rend3-test/tests/results/shadow/cube.png | Bin 5966 -> 5415 bytes rend3-test/tests/results/simple/empty.png | Bin 0 -> 434 bytes rend3-test/tests/root.rs | 1 + rend3-test/tests/shadow.rs | 20 +- rend3-test/tests/simple.rs | 42 +- rend3-types/src/lib.rs | 29 +- rend3/shaders/vertex_attributes.wgsl | 37 +- rend3/src/graph/data_handle.rs | 37 - rend3/src/graph/graph.rs | 9 + rend3/src/graph/mod.rs | 53 +- rend3/src/graph/node.rs | 15 + rend3/src/graph/texture_store.rs | 2 +- rend3/src/managers/graph_storage.rs | 26 +- rend3/src/managers/handle_alloc.rs | 37 +- rend3/src/managers/object.rs | 52 +- rend3/src/renderer/eval.rs | 11 +- rend3/src/renderer/mod.rs | 16 +- rend3/src/shader.rs | 28 +- rend3/src/util/bind_merge.rs | 9 + rend3/src/util/buffer.rs | 13 +- rend3/src/util/freelist/buffer.rs | 20 +- rend3/src/util/math.rs | 96 ++- rend3/src/util/scatter_copy.rs | 4 +- 68 files changed, 2730 insertions(+), 496 deletions(-) create mode 100644 rend3-routine/shaders/src/hi_z.wgsl create mode 100644 rend3-routine/shaders/src/resolve_depth_min.wgsl create mode 100644 rend3-routine/shaders/src/uniform_prep.wgsl create mode 100644 rend3-routine/src/common/camera.rs create mode 100644 rend3-routine/src/culling/suballoc.rs create mode 100644 rend3-routine/src/hi_z.rs create mode 100644 rend3-test/src/threshold.rs create mode 100644 rend3-test/tests/object.rs create mode 100644 rend3-test/tests/results/object/multi-frame-add-0.png create mode 100644 rend3-test/tests/results/object/multi-frame-add-1.png create mode 100644 
rend3-test/tests/results/simple/empty.png delete mode 100644 rend3/src/graph/data_handle.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8589f2eb..98d4572b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -79,6 +79,8 @@ jobs: if: matrix.target != 'wasm32-unknown-unknown' - uses: actions/upload-artifact@v3 + # always run + if: ${{ !cancelled() }} with: name: comparison-images-${{ matrix.name }} path: | diff --git a/Cargo.toml b/Cargo.toml index 5fc6ff1a..ba1f3401 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,7 @@ members = [ "rend3-gltf", "rend3-routine", "rend3-test", - "rend3-types" + "rend3-types", ] [profile.ci] diff --git a/deny.toml b/deny.toml index 446bd270..3c631336 100644 --- a/deny.toml +++ b/deny.toml @@ -13,8 +13,6 @@ license-files = [{ path = "COPYRIGHT", hash = 972598577 }] multiple-versions = "deny" wildcards = "allow" skip = [ - # hashbrown - { name = "ahash", version = "0.7.6" }, # gltf / reqwest { name = "base64", version = "0.13.1" }, # ddsfile diff --git a/examples/animation/src/lib.rs b/examples/animation/src/lib.rs index 458ad3ba..2ddcfa72 100644 --- a/examples/animation/src/lib.rs +++ b/examples/animation/src/lib.rs @@ -2,7 +2,7 @@ use std::{path::Path, sync::Arc}; use rend3::types::DirectionalLightHandle; -const SAMPLE_COUNT: rend3::types::SampleCount = rend3::types::SampleCount::Four; +const SAMPLE_COUNT: rend3::types::SampleCount = rend3::types::SampleCount::One; /// The application data, can only be obtained at `setup` time, so it's under an /// Option in the main struct. @@ -164,8 +164,12 @@ impl rend3_framework::App for AnimationExample { let mut graph = rend3::graph::RenderGraph::new(); // Import the surface texture into the render graph. - let frame_handle = - graph.add_imported_render_target(&frame, 0..1, rend3::graph::ViewportRect::from_size(resolution)); + let frame_handle = graph.add_imported_render_target( + &frame, + 0..1, + 0..1, + rend3::graph::ViewportRect::from_size(resolution), + ); // Add the default rendergraph without a skybox base_rendergraph.add_to_graph( &mut graph, diff --git a/examples/cube-no-framework/src/main.rs b/examples/cube-no-framework/src/main.rs index fa04e20a..6f294959 100644 --- a/examples/cube-no-framework/src/main.rs +++ b/examples/cube-no-framework/src/main.rs @@ -1,3 +1,5 @@ +#![cfg_attr(target_arch = "wasm32", allow(clippy::arc_with_non_send_sync))] + use std::sync::Arc; fn vertex(pos: [f32; 3]) -> glam::Vec3 { @@ -107,8 +109,13 @@ fn main() { let base_rendergraph = rend3_routine::base::BaseRenderGraph::new(&renderer, &spp); let mut data_core = renderer.data_core.lock(); - let pbr_routine = - rend3_routine::pbr::PbrRoutine::new(&renderer, &mut data_core, &spp, &base_rendergraph.interfaces); + let pbr_routine = rend3_routine::pbr::PbrRoutine::new( + &renderer, + &mut data_core, + &spp, + &base_rendergraph.interfaces, + &base_rendergraph.gpu_culler.culling_buffer_map_handle, + ); drop(data_core); let tonemapping_routine = rend3_routine::tonemapping::TonemappingRoutine::new( &renderer, @@ -209,7 +216,7 @@ fn main() { // Import the surface texture into the render graph. 
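Every example in this patch updates the same graph call: add_imported_render_target now takes a second 0..1 range (presumably the mip levels to import) alongside the existing layer range, ahead of the viewport. For reference, the new call shape, lifted from the hunks that follow (frame, graph, and resolution come from each example's redraw handler):

    // Layer range first, then the newly added range, then the viewport.
    let frame_handle = graph.add_imported_render_target(
        &frame,
        0..1, // layers
        0..1, // mips
        rend3::graph::ViewportRect::from_size(resolution),
    );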
let frame_handle = - graph.add_imported_render_target(&frame, 0..1, rend3::graph::ViewportRect::from_size(resolution)); + graph.add_imported_render_target(&frame, 0..1, 0..1, rend3::graph::ViewportRect::from_size(resolution)); // Add the default rendergraph without a skybox base_rendergraph.add_to_graph( &mut graph, diff --git a/examples/cube/src/lib.rs b/examples/cube/src/lib.rs index 10c04bcc..14faa4d0 100644 --- a/examples/cube/src/lib.rs +++ b/examples/cube/src/lib.rs @@ -167,8 +167,12 @@ impl rend3_framework::App for CubeExample { let mut graph = rend3::graph::RenderGraph::new(); // Import the surface texture into the render graph. - let frame_handle = - graph.add_imported_render_target(&frame, 0..1, rend3::graph::ViewportRect::from_size(resolution)); + let frame_handle = graph.add_imported_render_target( + &frame, + 0..1, + 0..1, + rend3::graph::ViewportRect::from_size(resolution), + ); // Add the default rendergraph without a skybox base_rendergraph.add_to_graph( &mut graph, diff --git a/examples/egui/src/main.rs b/examples/egui/src/main.rs index 98c4d0c5..70b81753 100644 --- a/examples/egui/src/main.rs +++ b/examples/egui/src/main.rs @@ -216,8 +216,12 @@ impl rend3_framework::App for EguiExample { let mut graph = rend3::graph::RenderGraph::new(); // Import the surface texture into the render graph. - let frame_handle = - graph.add_imported_render_target(&frame, 0..1, rend3::graph::ViewportRect::from_size(resolution)); + let frame_handle = graph.add_imported_render_target( + &frame, + 0..1, + 0..1, + rend3::graph::ViewportRect::from_size(resolution), + ); // Add the default rendergraph without a skybox base_rendergraph.add_to_graph( &mut graph, diff --git a/examples/scene-viewer/src/lib.rs b/examples/scene-viewer/src/lib.rs index bb01576d..bd61d8bc 100644 --- a/examples/scene-viewer/src/lib.rs +++ b/examples/scene-viewer/src/lib.rs @@ -166,6 +166,22 @@ fn extract_vsync(value: &str) -> Result }) } +fn extract_array(value: &str, default: [f32; N]) -> Result<[f32; N], &'static str> { + let mut res = default; + let split: Vec<_> = value.split(',').enumerate().collect(); + + if split.len() != N { + return Err("Mismatched argument count"); + } + + for (idx, inner) in split { + let inner = inner.trim(); + + res[idx] = inner.parse().map_err(|_| "Cannot parse argument number")?; + } + Ok(res) +} + fn extract_vec3(value: &str) -> Result { let mut res = [0.0_f32, 0.0, 0.0]; let split: Vec<_> = value.split(',').enumerate().collect(); @@ -254,6 +270,7 @@ Assets: Controls: --walk Walk speed (speed without holding shift) in units/second (typically meters). Default 10. --run Run speed (speed while holding shift) in units/second (typically meters). Default 50. + --camera x,y,z,pitch,yaw Spawns the camera at the given position. Press Period to get the current camera position. 
"; struct SceneViewer { @@ -326,6 +343,10 @@ impl SceneViewer { // Controls let walk_speed = args.value_from_str("--walk").unwrap_or(10.0_f32); let run_speed = args.value_from_str("--run").unwrap_or(50.0_f32); + let camera_default = [3.0, 3.0, 3.0, -std::f32::consts::FRAC_PI_8, std::f32::consts::FRAC_PI_4]; + let camera_info = args + .value_from_str("--camera") + .map_or(camera_default, |s: String| extract_array(&s, camera_default).unwrap()); // Free args let file_to_load: Option = args.free_from_str().ok(); @@ -382,9 +403,9 @@ impl SceneViewer { fullscreen, scancode_status: FastHashMap::default(), - camera_pitch: -std::f32::consts::FRAC_PI_8, - camera_yaw: std::f32::consts::FRAC_PI_4, - camera_location: Vec3A::new(3.0, 3.0, 3.0), + camera_pitch: camera_info[3], + camera_yaw: camera_info[4], + camera_location: Vec3A::new(camera_info[0], camera_info[1], camera_info[2]), previous_profiling_stats: None, timestamp_last_second: Instant::now(), timestamp_last_frame: Instant::now(), @@ -524,6 +545,8 @@ impl rend3_framework::App for SceneViewer { self.timestamp_last_frame = now; + // std::thread::sleep(Duration::from_millis(100)); + let rotation = Mat3A::from_euler(glam::EulerRot::XYZ, -self.camera_pitch, -self.camera_yaw, 0.0).transpose(); let forward = -rotation.z_axis; @@ -549,8 +572,15 @@ impl rend3_framework::App for SceneViewer { if button_pressed(&self.scancode_status, platform::Scancodes::Q) { self.camera_location += up * velocity * delta_time.as_secs_f32(); } - if button_pressed(&self.scancode_status, platform::Scancodes::Z) { - self.camera_location -= up * velocity * delta_time.as_secs_f32(); + if button_pressed(&self.scancode_status, platform::Scancodes::PERIOD) { + println!( + "{x},{y},{z},{pitch},{yaw}", + x = self.camera_location.x, + y = self.camera_location.y, + z = self.camera_location.z, + pitch = self.camera_pitch, + yaw = self.camera_yaw + ); } if button_pressed(&self.scancode_status, platform::Scancodes::ESCAPE) { @@ -595,8 +625,12 @@ impl rend3_framework::App for SceneViewer { // Build a rendergraph let mut graph = rend3::graph::RenderGraph::new(); - let frame_handle = - graph.add_imported_render_target(&frame, 0..1, rend3::graph::ViewportRect::from_size(resolution)); + let frame_handle = graph.add_imported_render_target( + &frame, + 0..1, + 0..1, + rend3::graph::ViewportRect::from_size(resolution), + ); // Add the default rendergraph base_rendergraph.add_to_graph( &mut graph, diff --git a/examples/skinning/src/lib.rs b/examples/skinning/src/lib.rs index 8cd322c2..ec7401b1 100644 --- a/examples/skinning/src/lib.rs +++ b/examples/skinning/src/lib.rs @@ -158,8 +158,12 @@ impl rend3_framework::App for SkinningExample { // Build a rendergraph let mut graph = rend3::graph::RenderGraph::new(); - let frame_handle = - graph.add_imported_render_target(&frame, 0..1, rend3::graph::ViewportRect::from_size(resolution)); + let frame_handle = graph.add_imported_render_target( + &frame, + 0..1, + 0..1, + rend3::graph::ViewportRect::from_size(resolution), + ); // Add the default rendergraph without a skybox base_rendergraph.add_to_graph( &mut graph, diff --git a/examples/static-gltf/src/main.rs b/examples/static-gltf/src/main.rs index e60b759f..3402f367 100644 --- a/examples/static-gltf/src/main.rs +++ b/examples/static-gltf/src/main.rs @@ -149,8 +149,12 @@ impl rend3_framework::App for GltfExample { let mut graph = rend3::graph::RenderGraph::new(); // Import the surface texture into the render graph. 
- let frame_handle = - graph.add_imported_render_target(&frame, 0..1, rend3::graph::ViewportRect::from_size(resolution)); + let frame_handle = graph.add_imported_render_target( + &frame, + 0..1, + 0..1, + rend3::graph::ViewportRect::from_size(resolution), + ); // Add the default rendergraph without a skybox base_rendergraph.add_to_graph( &mut graph, diff --git a/examples/textured-quad/src/main.rs b/examples/textured-quad/src/main.rs index 26d29029..2545ed4f 100644 --- a/examples/textured-quad/src/main.rs +++ b/examples/textured-quad/src/main.rs @@ -174,8 +174,12 @@ impl rend3_framework::App for TexturedQuadExample { let mut graph = rend3::graph::RenderGraph::new(); // Import the surface texture into the render graph. - let frame_handle = - graph.add_imported_render_target(&frame, 0..1, rend3::graph::ViewportRect::from_size(resolution)); + let frame_handle = graph.add_imported_render_target( + &frame, + 0..1, + 0..1, + rend3::graph::ViewportRect::from_size(resolution), + ); // Add the default rendergraph base_rendergraph.add_to_graph( &mut graph, diff --git a/rend3-framework/src/lib.rs b/rend3-framework/src/lib.rs index 12299c05..a52fec37 100644 --- a/rend3-framework/src/lib.rs +++ b/rend3-framework/src/lib.rs @@ -259,6 +259,7 @@ pub async fn async_start + 'static, T: 'static>(mut app: A, window_bui &mut data_core, &spp, &base_rendergraph.interfaces, + &base_rendergraph.gpu_culler.culling_buffer_map_handle, )), skybox: Mutex::new(rend3_routine::skybox::SkyboxRoutine::new( &renderer, diff --git a/rend3-routine/Cargo.toml b/rend3-routine/Cargo.toml index dd624ad3..881834eb 100644 --- a/rend3-routine/Cargo.toml +++ b/rend3-routine/Cargo.toml @@ -15,20 +15,18 @@ rust-version = "1.71" arrayvec = "0.7" bitflags = "2" bytemuck = "1" +codespan-reporting = "0.11" encase = { version = "0.6", features = ["glam"] } flume = "0.11" glam = { version = "0.24.0", features = ["bytemuck"] } log = "0.4" +naga = { version = "0.14", features = ["wgsl-in"] } ordered-float = "4" parking_lot = "0.12" profiling = {version = "1", default-features = false } rend3 = { version = "^0.3.0", path = "../rend3" } rust-embed = { version = "8", features = ["interpolate-folder-path"] } serde = { version = "1", features = ["derive"] } +serde_json = "1" wgpu = "0.18.0" wgpu-profiler = "0.15.0" - -[dev-dependencies] -codespan-reporting = "0.11" -naga = { version = "0.14", features = ["wgsl-in"] } -serde_json = { version = "1" } diff --git a/rend3-routine/shaders/src/cull.wgsl b/rend3-routine/shaders/src/cull.wgsl index 0809ebb5..9dc7792a 100644 --- a/rend3-routine/shaders/src/cull.wgsl +++ b/rend3-routine/shaders/src/cull.wgsl @@ -5,72 +5,374 @@ var vertex_buffer: array; @group(0) @binding(1) var object_buffer: array; + +fn vertex_fetch( + object_invocation: u32, + object_info: ptr, +) -> Triangle { + let index_0_index = object_invocation * 3u + 0u; + let index_1_index = object_invocation * 3u + 1u; + let index_2_index = object_invocation * 3u + 2u; + + let object = object_buffer[(*object_info).object_id]; + + let index0 = vertex_buffer[object.first_index + index_0_index]; + let index1 = vertex_buffer[object.first_index + index_1_index]; + let index2 = vertex_buffer[object.first_index + index_2_index]; + + let position_start_offset = object.vertex_attribute_start_offsets[{{position_attribute_offset}}]; + let model_position0 = extract_attribute_vec3_f32(position_start_offset, index0); + let model_position1 = extract_attribute_vec3_f32(position_start_offset, index1); + let model_position2 = 
extract_attribute_vec3_f32(position_start_offset, index2); + + return Triangle( + TriangleVertices(model_position0, model_position1, model_position2), + TriangleIndices(index0, index1, index2) + ); +} + @group(0) @binding(2) var culling_job: BatchData; + +struct DrawCallBuffer { + /// We always put the buffer that needs to be present in the next frame first. + predicted_object_offset: u32, + residual_object_offset: u32, + calls: array, +} + @group(0) @binding(3) -var output_buffer: array; +var draw_calls: DrawCallBuffer; + +fn init_draw_calls(global_invocation: u32, region_id: u32) { + // Init the inheritable draw call + let predicted_object_draw_index = draw_calls.predicted_object_offset + region_id; + draw_calls.calls[predicted_object_draw_index].vertex_offset = 0; + draw_calls.calls[predicted_object_draw_index].instance_count = 1u; + draw_calls.calls[predicted_object_draw_index].base_instance = 0u; + draw_calls.calls[predicted_object_draw_index].base_index = global_invocation * 3u; + + // Init the residual objects draw call + let residual_object_draw_index = draw_calls.residual_object_offset + region_id; + draw_calls.calls[residual_object_draw_index].vertex_offset = 0; + draw_calls.calls[residual_object_draw_index].instance_count = 1u; + draw_calls.calls[residual_object_draw_index].base_instance = 0u; + draw_calls.calls[residual_object_draw_index].base_index = global_invocation * 3u; +} + +fn add_predicted_triangle_to_draw_call(region_id: u32) -> u32 { + let output_region_index = atomicAdd(&draw_calls.calls[draw_calls.predicted_object_offset + region_id].vertex_count, 3u); + let output_region_triangle = output_region_index / 3u; + return output_region_triangle; +} + +fn add_residual_triangle_to_draw_call(region_id: u32) -> u32 { + let output_region_index = atomicAdd(&draw_calls.calls[draw_calls.residual_object_offset + region_id].vertex_count, 3u); + let output_region_triangle = output_region_index / 3u; + return output_region_triangle; +} + +struct OutputIndexBuffer { + /// We always put the buffer that needs to be present in the next frame first. 
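The add_predicted_triangle_to_draw_call and add_residual_triangle_to_draw_call helpers above build the indirect draw calls by bumping each region's vertex_count atomically, so every surviving triangle claims its own output slot without any ordering between threads. A CPU-side model of that allocation, using a hypothetical claim_triangle_slot helper with the same fetch-add-by-3 scheme:

    use std::sync::atomic::{AtomicU32, Ordering};

    // Each surviving triangle claims three indices from the region's indirect
    // draw call; the previous value of vertex_count, divided by 3, is the
    // triangle's slot inside that region's output range.
    fn claim_triangle_slot(region_vertex_count: &AtomicU32) -> u32 {
        let first_index = region_vertex_count.fetch_add(3, Ordering::Relaxed);
        first_index / 3
    }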
+ predicted_object_offset: u32, + residual_object_offset: u32, + indices: array, +} +@group(0) @binding(4) +var output_indices : OutputIndexBuffer; + +fn write_predicted_atomic_triangle( + batch_object_index: u32, + object_info: ptr, + indices: TriangleIndices, +) { + let region_invocation = add_predicted_triangle_to_draw_call((*object_info).region_id); + let batch_invocation = region_invocation + (*object_info).region_base_invocation; + let global_invocation = batch_invocation + culling_job.batch_base_invocation; + + let packed_indices = pack_batch_indices(batch_object_index, indices); + + let predicted_object_indices_index = output_indices.predicted_object_offset + global_invocation * 3u; + output_indices.indices[predicted_object_indices_index] = packed_indices[0]; + output_indices.indices[predicted_object_indices_index + 1u] = packed_indices[1]; + output_indices.indices[predicted_object_indices_index + 2u] = packed_indices[2]; +} + +fn write_residual_atomic_triangle( + batch_object_index: u32, + object_info: ptr, + indices: TriangleIndices, +) { + let region_invocation = add_residual_triangle_to_draw_call((*object_info).region_id); + let batch_invocation = region_invocation + (*object_info).region_base_invocation; + let global_invocation = batch_invocation + culling_job.batch_base_invocation; -struct ObjectRangeIndex { - range: ObjectRange, - index: u32, + let packed_indices = pack_batch_indices(batch_object_index, indices); + + let residual_object_indices_index = output_indices.residual_object_offset + global_invocation * 3u; + output_indices.indices[residual_object_indices_index] = packed_indices[0]; + output_indices.indices[residual_object_indices_index + 1u] = packed_indices[1]; + output_indices.indices[residual_object_indices_index + 2u] = packed_indices[2]; +} + +fn write_residual_nonatomic_triangle( + invocation: u32, + batch_object_index: u32, + object_info: ptr, + indices: TriangleIndices, +) { + add_residual_triangle_to_draw_call((*object_info).region_id); + + let packed_indices = pack_batch_indices(batch_object_index, indices); + + let residual_object_indices_index = output_indices.residual_object_offset + invocation * 3u; + output_indices.indices[residual_object_indices_index] = packed_indices[0]; + output_indices.indices[residual_object_indices_index + 1u] = packed_indices[1]; + output_indices.indices[residual_object_indices_index + 2u] = packed_indices[2]; +} + +fn write_invalid_residual_nonatomic_triangle(invocation: u32, object_info: ptr) { + add_residual_triangle_to_draw_call((*object_info).region_id); + + let residual_object_indices_index = output_indices.residual_object_offset + invocation * 3u; + output_indices.indices[residual_object_indices_index] = INVALID_VERTEX; + output_indices.indices[residual_object_indices_index + 1u] = INVALID_VERTEX; + output_indices.indices[residual_object_indices_index + 2u] = INVALID_VERTEX; +} + +struct CullingResults { + /// We always put the buffer that needs to be present in the next frame first. 
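The culling results buffer below stores one visibility bit per invocation, packed 32 to a word; get_previous_culling_result recovers last frame's verdict for a triangle from it. A small Rust model of that lookup (the helper name is illustrative, the bit layout mirrors the shader):

    // Invocation i lives at word i / 32, bit i % 32 of the culling-results
    // buffer; a set bit means the triangle passed culling last frame.
    fn passed_culling_last_frame(bits: &[u32], invocation: u32) -> bool {
        ((bits[(invocation / 32) as usize] >> (invocation % 32)) & 1) == 1
    }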
+ output_offset: u32, + input_offset: u32, + bits: array, +} +@group(0) @binding(5) +var culling_results: CullingResults; + +fn get_previous_culling_result(object_info: ptr, object_invocation: u32) -> bool { + if (*object_info).previous_global_invocation == 0xFFFFFFFFu { + return false; + } + + let previous_global_invocation = object_invocation + (*object_info).previous_global_invocation; + let bitmask = culling_results.bits[culling_results.input_offset + (previous_global_invocation / 32u)]; + return ((bitmask >> (previous_global_invocation % 32u)) & 0x1u) == 0x1u; } -var workgroup_object_range: ObjectRangeIndex; +@group(0) @binding(6) +var per_camera_uniform: PerCameraUniform; + +fn is_shadow_pass() -> bool { + return per_camera_uniform.shadow_index != 0xFFFFFFFFu; +} + +@group(0) @binding(7) +var hirearchical_z_buffer: texture_depth_2d; +@group(0) @binding(8) +var nearest_sampler: sampler; + +{{include "rend3/vertex_attributes.wgsl"}} + +struct ObjectSearchResult { + range: ObjectCullingInformation, + index_within_region: u32, +} + +fn find_object_info(wid: u32) -> ObjectSearchResult { + let target_invocation = wid * 64u; + // pulled directly from https://doc.rust-lang.org/src/core/slice/mod.rs.html#2412-2438 + + var size = culling_job.total_objects; + var left = 0u; + var right = size; + var object_info: ObjectCullingInformation; + while left < right { + let mid = left + size / 2u; + + let probe = culling_job.object_culling_information[mid]; + + if probe.invocation_end <= target_invocation { + left = mid + 1u; + } else if probe.invocation_start > target_invocation { + right = mid; + } else { + return ObjectSearchResult(probe, mid); + } + + size = right - left; + } + + // This is unreachable, but required for the compiler to be happy + return ObjectSearchResult(object_info, 0xFFFFFFFFu); +} -@compute @workgroup_size(256) +// 64 workgroup size / 32 bits +var workgroup_culling_results: array, 2>; + +fn clear_culling_results(lid: u32) { + if lid == 0u { + atomicStore(&workgroup_culling_results[0], 0u); + atomicStore(&workgroup_culling_results[1], 0u); + } +} + +fn compute_culling_results(lid: u32, passed_culling: bool) { + atomicOr(&workgroup_culling_results[lid / 32u], u32(passed_culling) << (lid % 32u)); +} + +fn save_culling_results(global_invocation: u32, lid: u32) { + if lid == 0u { + let culling_results_index = culling_results.output_offset + (global_invocation / 32u); + culling_results.bits[culling_results_index + 0u] = atomicLoad(&workgroup_culling_results[0]); + culling_results.bits[culling_results_index + 1u] = atomicLoad(&workgroup_culling_results[1]); + } +} + +fn textureSampleMin(texture: texture_depth_2d, uv: vec2, mipmap: f32) -> f32 { + let int_mipmap = i32(mipmap); + let mip_resolution = vec2(textureDimensions(texture, int_mipmap).xy); + + let pixel_coords = uv * mip_resolution - 0.5; + + let low = vec2(max(floor(pixel_coords), vec2(0.0))); + let high = vec2(min(ceil(pixel_coords), mip_resolution - 1.0)); + + let top_left = vec2(low.x, low.y); + let top_right = vec2(high.x, low.y); + let bottom_left = vec2(low.x, high.y); + let bottom_right = vec2(high.x, high.y); + + var minval = textureLoad(texture, top_left, int_mipmap); + minval = min(minval, textureLoad(texture, top_right, int_mipmap)); + minval = min(minval, textureLoad(texture, bottom_left, int_mipmap)); + minval = min(minval, textureLoad(texture, bottom_right, int_mipmap)); + return minval; +} + +fn execute_culling( + model_view_proj: mat4x4, + vertices: TriangleVertices, +) -> bool { + let position0 = 
model_view_proj * vec4(vertices[0], 1.0); + let position1 = model_view_proj * vec4(vertices[1], 1.0); + let position2 = model_view_proj * vec4(vertices[2], 1.0); + + let det = determinant(mat3x3(position0.xyw, position1.xyw, position2.xyw)); + + if (per_camera_uniform.flags & PCU_FLAGS_AREA_VISIBLE_MASK) == PCU_FLAGS_POSITIVE_AREA_VISIBLE && det <= 0.0 { + return false; + } + if (per_camera_uniform.flags & PCU_FLAGS_AREA_VISIBLE_MASK) == PCU_FLAGS_NEGATIVE_AREA_VISIBLE && det >= 0.0 { + return false; + } + + let ndc0 = position0.xyz / position0.w; + let ndc1 = position1.xyz / position1.w; + let ndc2 = position2.xyz / position2.w; + + let min_ndc_xy = min(ndc0.xy, min(ndc1.xy, ndc2.xy)); + let max_ndc_xy = max(ndc0.xy, max(ndc1.xy, ndc2.xy)); + + let half_res = per_camera_uniform.resolution / 2.0; + let min_screen_xy = (min_ndc_xy + 1.0) * half_res; + let max_screen_xy = (max_ndc_xy + 1.0) * half_res; + + if (per_camera_uniform.flags & PCU_FLAGS_MULTISAMPLE_MASK) == PCU_FLAGS_MULTISAMPLE_DISABLED { + let misses_pixel_center = any(round(min_screen_xy) == round(max_screen_xy)); + + if misses_pixel_center { + return false; + } + } + + // We skip hi-z calculation if we're doing a shadow pass + if per_camera_uniform.shadow_index != 0xFFFFFFFFu { + return true; + } + + var min_tex_coords = (min_ndc_xy + 1.0) / 2.0; + var max_tex_coords = (max_ndc_xy + 1.0) / 2.0; + min_tex_coords.y = 1.0 - min_tex_coords.y; + max_tex_coords.y = 1.0 - max_tex_coords.y; + + let uv = (max_tex_coords + min_tex_coords) / 2.0; + let edges = max_screen_xy - min_screen_xy; + + let longest_edge = max(edges.x, edges.y); + let mip = ceil(log2(max(longest_edge, 1.0))); + + let depth = max(max(ndc0.z, ndc1.z), ndc2.z); + let occlusion_depth = textureSampleMin(hirearchical_z_buffer, uv, mip); + + if depth < occlusion_depth { + return false; + } + + return true; +} + +@compute @workgroup_size(64) fn cs_main( @builtin(workgroup_id) wid: vec3, @builtin(global_invocation_id) gid: vec3, @builtin(local_invocation_id) lid: vec3, ) { - if (lid.x == 0u) { - let target_invocation = wid.x * 256u; - // pulled directly from https://doc.rust-lang.org/src/core/slice/mod.rs.html#2412-2438 - - var size = culling_job.total_objects; - var left = 0u; - var right = size; - while left < right { - let mid = left + size / 2u; - - let probe = culling_job.ranges[mid]; - - if probe.invocation_end <= target_invocation { - left = mid + 1u; - } else if probe.invocation_start > target_invocation { - right = mid; - } else { - workgroup_object_range = ObjectRangeIndex(probe, mid); - break; - } + clear_culling_results(lid.x); + + let object_search_results = find_object_info(wid.x); + var object_info = object_search_results.range; + let batch_object_index = object_search_results.index_within_region; + let global_invocation = culling_job.batch_base_invocation + gid.x; - size = right - left; + // We need the workgroupBarrier to be in uniform control flow, so we can't return early here. + // + // If this is true, continue working on the other side of the barrier. 
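execute_culling above rejects a triangle when the hi-z pyramid shows that everything under its screen-space bounds is already covered by nearer geometry. The mip level it samples is chosen so the triangle's bounding box shrinks to at most about one texel at that level, letting the four-tap textureSampleMin bound the depth beneath the whole box. A sketch of the same arithmetic (hi_z_mip is an illustrative helper; the formula is the shader's ceil(log2(max(longest_edge, 1.0)))):

    // For a screen-space AABB whose longest edge is `longest_edge` pixels,
    // mip ceil(log2(longest_edge)) is the first level where that edge fits
    // within roughly one texel.
    fn hi_z_mip(extent_pixels: [f32; 2]) -> f32 {
        let longest_edge = extent_pixels[0].max(extent_pixels[1]).max(1.0);
        longest_edge.log2().ceil()
    }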
+ var write_culling_output = true; + if gid.x >= object_info.invocation_end { + if object_info.atomic_capable == 0u { + write_invalid_residual_nonatomic_triangle(global_invocation, &object_info); } - } + write_culling_output = false; + } else { + let object_invocation = gid.x - object_info.invocation_start; - workgroupBarrier(); + // If the first invocation in the region, set the region's draw call + if object_info.local_region_id == 0u && object_invocation == 0u { + init_draw_calls(global_invocation, object_info.region_id); + } - let object_range = workgroup_object_range.range; - let local_object_index = workgroup_object_range.index; + let triangle = vertex_fetch(object_invocation, &object_info); - if (gid.x >= object_range.invocation_end) { - output_buffer[(culling_job.base_output_invocation + gid.x) * 3u + 0u] = 0x00FFFFFFu; - output_buffer[(culling_job.base_output_invocation + gid.x) * 3u + 1u] = 0x00FFFFFFu; - output_buffer[(culling_job.base_output_invocation + gid.x) * 3u + 2u] = 0x00FFFFFFu; - return; - } + let model_view_proj = per_camera_uniform.objects[object_info.object_id].model_view_proj; - let index_0_index = (gid.x - object_range.invocation_start) * 3u + 0u; - let index_1_index = (gid.x - object_range.invocation_start) * 3u + 1u; - let index_2_index = (gid.x - object_range.invocation_start) * 3u + 2u; + let passes_culling = execute_culling(model_view_proj, triangle.vertices); - let object = object_buffer[object_range.object_id]; + if object_info.atomic_capable == 1u { + if passes_culling { + write_predicted_atomic_triangle(batch_object_index, &object_info, triangle.indices); - let index0 = vertex_buffer[object.first_index + index_0_index]; - let index1 = vertex_buffer[object.first_index + index_1_index]; - let index2 = vertex_buffer[object.first_index + index_2_index]; + if !is_shadow_pass() { + let previously_passed_culling = get_previous_culling_result(&object_info, object_invocation); + + if !previously_passed_culling { + write_residual_atomic_triangle(batch_object_index, &object_info, triangle.indices); + } + } + } + } else { + if passes_culling { + write_residual_nonatomic_triangle(global_invocation, batch_object_index, &object_info, triangle.indices); + } else { + write_invalid_residual_nonatomic_triangle(global_invocation, &object_info); + } + } + + compute_culling_results(lid.x, passes_culling); + } - output_buffer[(culling_job.base_output_invocation + gid.x) * 3u + 0u] = (local_object_index << 24u) | (index0 & ((1u << 24u) - 1u)); - output_buffer[(culling_job.base_output_invocation + gid.x) * 3u + 1u] = (local_object_index << 24u) | (index1 & ((1u << 24u) - 1u)); - output_buffer[(culling_job.base_output_invocation + gid.x) * 3u + 2u] = (local_object_index << 24u) | (index2 & ((1u << 24u) - 1u)); + workgroupBarrier(); + + if write_culling_output { + save_culling_results(global_invocation, lid.x); + } } diff --git a/rend3-routine/shaders/src/depth.wgsl b/rend3-routine/shaders/src/depth.wgsl index 54984865..9705f756 100644 --- a/rend3-routine/shaders/src/depth.wgsl +++ b/rend3-routine/shaders/src/depth.wgsl @@ -15,16 +15,18 @@ var object_buffer: array; var batch_data: BatchData; @group(1) @binding(2) var vertex_buffer: array; +@group(1) @binding(3) +var per_camera_uniform: PerCameraUniform; {{#if (eq profile "GpuDriven")}} -@group(1) @binding(3) +@group(1) @binding(4) var materials: array; @group(2) @binding(0) var textures: binding_array>; {{/if}} {{#if (eq profile "CpuDriven")}} -@group(1) @binding(3) +@group(1) @binding(4) var materials: array; @group(2) @binding(0) 
var albedo_tex: texture_2d; @@ -49,19 +51,31 @@ struct VertexOutput { } @vertex -fn vs_main(@builtin(instance_index) shadow_number: u32, @builtin(vertex_index) vertex_index: u32) -> VertexOutput { - if vertex_index == 0x00FFFFFFu { +fn vs_main(@builtin(vertex_index) vertex_index: u32) -> VertexOutput { + // If the vertex index is our sentinel invalid value, return a degenerate triangle. + // + // This is used by the culling shader to discard triangles when the ordering of the + // triangles are important, and atomics can't be used. + if vertex_index == INVALID_VERTEX { var vs_out: VertexOutput; vs_out.position = vec4(0.0); return vs_out; } let indices = unpack_vertex_index(vertex_index); + + let data = object_buffer[indices.object]; + // If the object is disabled, return a degenerate triangle. + // + // This happens when the object is deleted, and we're rendering last-frame's objects. + if data.enabled == 0u { + var vs_out: VertexOutput; + vs_out.position = vec4(0.0); + return vs_out; + } let vs_in = get_vertices(indices); - let data = object_buffer[indices.object]; - // TODO: Store these in uniforms - let model_view_proj = directional_lights.data[shadow_number].view_proj * data.transform; + let model_view_proj = per_camera_uniform.objects[indices.object].model_view_proj; let position_vec4 = vec4(vs_in.position, 1.0); diff --git a/rend3-routine/shaders/src/hi_z.wgsl b/rend3-routine/shaders/src/hi_z.wgsl new file mode 100644 index 00000000..ecd1e176 --- /dev/null +++ b/rend3-routine/shaders/src/hi_z.wgsl @@ -0,0 +1,33 @@ +@group(0) @binding(0) +var source: texture_depth_2d; + +struct VertexOutput { + @builtin(position) position: vec4, + @location(0) @interpolate(flat) resolution: vec2, +} + +@vertex +fn vs_main(@builtin(vertex_index) id: u32) -> VertexOutput { + let resolution = vec2(textureDimensions(source)); + var output: VertexOutput; + output.position = vec4(f32(id / 2u) * 4.0 - 1.0, f32(id % 2u) * 4.0 - 1.0, 0.0, 1.0); + output.resolution = resolution; + return output; +} + +@fragment +fn fs_main(vout: VertexOutput) -> @builtin(frag_depth) f32 { + let this_tex_coord = vec2(vout.position.xy); + let previous_base_tex_coord = 2u * this_tex_coord; + + let count_odd = vout.resolution & vec2(1u); + + var nearest = 1.0; + for (var x = 0u; x < 2u + count_odd.x; x += 1u) { + for (var y = 0u; y < 2u + count_odd.y; y += 1u) { + nearest = min(nearest, textureLoad(source, previous_base_tex_coord + vec2(x, y), 0)); + } + } + + return nearest; +} \ No newline at end of file diff --git a/rend3-routine/shaders/src/opaque.wgsl b/rend3-routine/shaders/src/opaque.wgsl index 739eb6f1..3d805515 100644 --- a/rend3-routine/shaders/src/opaque.wgsl +++ b/rend3-routine/shaders/src/opaque.wgsl @@ -25,16 +25,18 @@ var object_buffer: array; var batch_data: BatchData; @group(1) @binding(2) var vertex_buffer: array; +@group(1) @binding(3) +var per_camera_uniform: PerCameraUniform; {{#if (eq profile "GpuDriven")}} -@group(1) @binding(3) +@group(1) @binding(4) var materials: array; @group(2) @binding(0) var textures: binding_array>; {{/if}} {{#if (eq profile "CpuDriven")}} -@group(1) @binding(3) +@group(1) @binding(4) var materials: array; @group(2) @binding(0) var albedo_tex: texture_2d; @@ -86,19 +88,31 @@ struct VertexOutput { @vertex fn vs_main(@builtin(vertex_index) vertex_index: u32) -> VertexOutput { - if vertex_index == 0x00FFFFFFu { + // If the vertex index is our sentinel invalid value, return a degenerate triangle. 
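The new hi_z.wgsl pass above (and the resolve_depth_min shader further down) draws a single oversized triangle instead of a quad, using the id / 2 and id % 2 expressions in vs_main. A quick check of that formula in Rust, assuming vertex ids 0, 1, 2:

    // id 0 -> (-1, -1), id 1 -> (-1, 3), id 2 -> (3, -1); the clip-space
    // square [-1, 1]^2 lies entirely inside this triangle, so the pass needs
    // no vertex or index buffers.
    fn fullscreen_triangle_vertex(id: u32) -> (f32, f32) {
        ((id / 2) as f32 * 4.0 - 1.0, (id % 2) as f32 * 4.0 - 1.0)
    }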
+ // + // This is used by the culling shader to discard triangles when the ordering of the + // triangles are important, and atomics can't be used. + if vertex_index == INVALID_VERTEX { var vs_out: VertexOutput; vs_out.position = vec4(0.0); return vs_out; } let indices = unpack_vertex_index(vertex_index); + + let data = object_buffer[indices.object]; + // If the object is disabled, return a degenerate triangle. + // + // This happens when the object is deleted, and we're rendering last-frame's objects. + if data.enabled == 0u { + var vs_out: VertexOutput; + vs_out.position = vec4(0.0); + return vs_out; + } let vs_in = get_vertices(indices); - let data = object_buffer[indices.object]; - // TODO: Store these in uniforms - let model_view = uniforms.view * data.transform; - let model_view_proj = uniforms.view_proj * data.transform; + let model_view = per_camera_uniform.objects[indices.object].model_view; + let model_view_proj = per_camera_uniform.objects[indices.object].model_view_proj; let position_vec4 = vec4(vs_in.position, 1.0); let mv_mat3 = mat3x3(model_view[0].xyz, model_view[1].xyz, model_view[2].xyz); diff --git a/rend3-routine/shaders/src/resolve_depth_min.wgsl b/rend3-routine/shaders/src/resolve_depth_min.wgsl new file mode 100644 index 00000000..677ef722 --- /dev/null +++ b/rend3-routine/shaders/src/resolve_depth_min.wgsl @@ -0,0 +1,27 @@ +@group(0) @binding(0) +var source: texture_depth_multisampled_2d; + +const SAMPLES: i32 = {{SAMPLES}}; + +struct VertexOutput { + @builtin(position) position: vec4, +} + +@vertex +fn vs_main(@builtin(vertex_index) id: u32) -> VertexOutput { + let resolution = vec2(textureDimensions(source)); + var output: VertexOutput; + output.position = vec4(f32(id / 2u) * 4.0 - 1.0, f32(id % 2u) * 4.0 - 1.0, 0.0, 1.0); + return output; +} + +@fragment +fn fs_main(vout: VertexOutput) -> @builtin(frag_depth) f32 { + var nearest: f32 = 1.0; + + for (var sample = 0; sample < SAMPLES; sample += 1) { + nearest = min(nearest, textureLoad(source, vec2u(vout.position.xy), sample)); + } + + return nearest; +} diff --git a/rend3-routine/shaders/src/structures.wgsl b/rend3-routine/shaders/src/structures.wgsl index f01f2cb4..0d85bda4 100644 --- a/rend3-routine/shaders/src/structures.wgsl +++ b/rend3-routine/shaders/src/structures.wgsl @@ -18,7 +18,7 @@ struct ObjectOutputData { } struct IndirectCall { - vertex_count: u32, + vertex_count: atomic, instance_count: u32, base_index: u32, vertex_offset: i32, @@ -37,6 +37,40 @@ struct UniformData { resolution: vec2, } +struct PerCameraUniformObjectData { + // TODO: use less space + model_view: mat4x4, + // TODO: use less space + model_view_proj: mat4x4, +} + +struct PerCameraUniform { + // TODO: use less space + view: mat4x4, + // TODO: use less space + view_proj: mat4x4, + // The index of which shadow caster we are rendering for. + // + // This will be u32::MAX if we're rendering for a camera, not a shadow map. + shadow_index: u32, + frustum: Frustum, + resolution: vec2, + // Uses PCU_FLAGS_* constants + flags: u32, + object_count: u32, + objects: array, +} + +// Area visible +const PCU_FLAGS_AREA_VISIBLE_MASK: u32 = 0x1u; +const PCU_FLAGS_NEGATIVE_AREA_VISIBLE: u32 = 0x0u; +const PCU_FLAGS_POSITIVE_AREA_VISIBLE: u32 = 0x1u; + +// Multisampled +const PCU_FLAGS_MULTISAMPLE_MASK: u32 = 0x2u; +const PCU_FLAGS_MULTISAMPLE_DISABLED: u32 = 0x0u; +const PCU_FLAGS_MULTISAMPLE_ENABLED: u32 = 0x2u; + struct DirectionalLight { /// View/Projection of directional light. 
Shadow rendering uses viewports /// so this always outputs [-1, 1] no matter where in the atlast the shadow is. diff --git a/rend3-routine/shaders/src/structures_object.wgsl b/rend3-routine/shaders/src/structures_object.wgsl index 19d00fc4..2bb5b32c 100644 --- a/rend3-routine/shaders/src/structures_object.wgsl +++ b/rend3-routine/shaders/src/structures_object.wgsl @@ -7,17 +7,24 @@ struct Object { index_count: u32, material_index: u32, vertex_attribute_start_offsets: array, + // 1 if enabled, 0 if disabled + enabled: u32, } -struct ObjectRange { +struct ObjectCullingInformation { invocation_start: u32, invocation_end: u32, object_id: u32, + region_id: u32, + region_base_invocation: u32, + local_region_id: u32, + previous_global_invocation: u32, + atomic_capable: u32, } struct BatchData { - ranges: array, total_objects: u32, total_invocations: u32, - base_output_invocation: u32, + batch_base_invocation: u32, + object_culling_information: array, } diff --git a/rend3-routine/shaders/src/uniform_prep.wgsl b/rend3-routine/shaders/src/uniform_prep.wgsl new file mode 100644 index 00000000..7cc2ae8f --- /dev/null +++ b/rend3-routine/shaders/src/uniform_prep.wgsl @@ -0,0 +1,27 @@ +{{include "rend3-routine/structures.wgsl"}} +{{include "rend3-routine/structures_object.wgsl"}} + +@group(0) @binding(0) +var object_buffer: array; +@group(0) @binding(1) +var per_camera_uniform: PerCameraUniform; + +@compute @workgroup_size(64) +fn cs_main( + @builtin(global_invocation_id) gid: vec3, +) { + let idx = gid.x; + + if idx >= per_camera_uniform.object_count { + return; + } + if object_buffer[idx].enabled == 0u { + return; + } + + let model_view = per_camera_uniform.view * object_buffer[idx].transform; + let model_view_proj = per_camera_uniform.view_proj * object_buffer[idx].transform; + + per_camera_uniform.objects[idx].model_view = model_view; + per_camera_uniform.objects[idx].model_view_proj = model_view_proj; +} diff --git a/rend3-routine/src/base.rs b/rend3-routine/src/base.rs index aac4755d..6252db3c 100644 --- a/rend3-routine/src/base.rs +++ b/rend3-routine/src/base.rs @@ -27,7 +27,53 @@ use rend3::{ }; use wgpu::{BindGroup, Buffer}; -use crate::{common, culling, forward::RoutineAddToGraphArgs, pbr, skinning, skybox, tonemapping}; +use crate::{ + common::{self, CameraIndex}, + culling, + forward::RoutineAddToGraphArgs, + pbr, skinning, skybox, tonemapping, +}; + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct DepthTargets { + pub single_sample_mipped: RenderTargetHandle, + pub multi_sample: Option, +} + +impl DepthTargets { + pub fn new(graph: &mut RenderGraph<'_>, resolution: UVec2, samples: SampleCount) -> Self { + let single_sample_mipped = graph.add_render_target(RenderTargetDescriptor { + label: Some("hdr depth".into()), + resolution, + depth: 1, + mip_levels: None, + samples: SampleCount::One, + format: TextureFormat::Depth32Float, + usage: TextureUsages::RENDER_ATTACHMENT | TextureUsages::TEXTURE_BINDING, + }); + + let multi_sample = samples.needs_resolve().then(|| { + graph.add_render_target(RenderTargetDescriptor { + label: Some("hdr depth multisampled".into()), + resolution, + depth: 1, + mip_levels: Some(1), + samples, + format: TextureFormat::Depth32Float, + usage: TextureUsages::RENDER_ATTACHMENT | TextureUsages::TEXTURE_BINDING, + }) + }); + + Self { + single_sample_mipped, + multi_sample, + } + } + + pub fn rendering_target(&self) -> RenderTargetHandle { + self.multi_sample.unwrap_or(self.single_sample_mipped.set_mips(0..1)) + } +} /// Starter RenderGraph. 
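uniform_prep.wgsl above runs one compute thread per object, multiplying each object's transform by the camera's view and view_proj so the vertex shaders can read a ready-made matrix per object instead of recomputing it. With @workgroup_size(64), the host side has to dispatch ceil(object_count / 64) workgroups; a sketch of that sizing (the helper name is illustrative, the real dispatch lives in the culler code and is not part of this hunk):

    // One thread per object; round the dispatch up so the last partial
    // workgroup still launches (its extra threads early-out on the
    // object_count check in the shader).
    fn uniform_prep_workgroups(object_count: u32) -> u32 {
        (object_count + 63) / 64
    }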
/// @@ -76,35 +122,58 @@ impl BaseRenderGraph { ambient: Vec4, clear_color: Vec4, ) { - // Create intermediate storage + // Create the data and handles for the graph. let state = BaseRenderGraphIntermediateState::new(graph, eval_output, resolution, samples); - // Preparing and uploading data + // Clear the shadow map. + state.clear_shadow(graph); + + // Prepare all the uniforms that all shaders need access to. state.create_frame_uniforms(graph, self, ambient, resolution); - // Skinning + // Perform compute based skinning. state.skinning(graph, self); - // Culling + // Upload the uniforms for the objects in the shadow pass. + state.shadow_object_uniform_upload(graph, self, eval_output); + // Perform culling for the objects in the shadow pass. state.pbr_shadow_culling(graph, self); - state.pbr_culling(graph, self); - // Depth-only rendering + // Render all the shadows to the shadow map. state.pbr_shadow_rendering(graph, pbr, &eval_output.shadows); - // Clear targets + // Clear the primary render target and depth target. state.clear(graph, clear_color); - // Forward rendering opaque - state.pbr_forward_rendering_opaque(graph, pbr, samples); + // Upload the uniforms for the objects in the forward pass. + state.object_uniform_upload(graph, self, resolution, samples); + + // Do the first pass, rendering the predicted triangles from last frame. + state.pbr_render_opaque_predicted_triangles(graph, pbr, samples); - // Skybox + // Create the hi-z buffer. + state.hi_z(graph, pbr, resolution); + + // Perform culling for the objects in the forward pass. + // + // The result of culling will be used to predict the visible triangles for + // the next frame. It will also render all the triangles that were visible + // but were not predicted last frame. + state.pbr_culling(graph, self); + + // Do the second pass, rendering the residual triangles. + state.pbr_render_opaque_residual_triangles(graph, pbr, samples); + + // Render the skybox. state.skybox(graph, skybox, samples); - // Forward rendering transparent + // Render all transparent objects. + // + // This _must_ happen after culling, as all transparent objects are + // considered "residual". state.pbr_forward_rendering_transparent(graph, pbr, samples); - // Make the reference to the surface + // Tonemap the HDR inner buffer to the output buffer. state.tonemapping(graph, tonemapping, target_texture); } } @@ -115,15 +184,15 @@ impl BaseRenderGraph { /// so desire. 
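The reordered add_to_graph above is the heart of the change: pass 1 redraws whatever last frame's culling predicted visible, its depth feeds the hi-z pyramid, culling then runs against that pyramid, and pass 2 draws only the residual triangles that were visible but not predicted. A small model of how a culled triangle lands in the two index streams, inferred from the write_predicted/write_residual paths in cull.wgsl (transparent and other sort-constrained objects skip prediction and always take the residual path):

    /// Which output stream(s) a triangle is written to this frame.
    #[derive(Debug, PartialEq)]
    enum OutputStream {
        Predicted,            // already drawn in pass 1, kept for next frame's prediction
        PredictedAndResidual, // missed by last frame's prediction, drawn late in pass 2
        None,                 // failed culling entirely
    }

    fn classify(passes_culling: bool, predicted_last_frame: bool) -> OutputStream {
        match (passes_culling, predicted_last_frame) {
            (true, true) => OutputStream::Predicted,
            (true, false) => OutputStream::PredictedAndResidual,
            (false, _) => OutputStream::None,
        }
    }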
pub struct BaseRenderGraphIntermediateState { pub pre_cull: DataHandle, - pub shadow_cull: Vec>, - pub cull: DataHandle, + pub shadow_cull: Vec>>, + pub cull: DataHandle>, pub shadow_uniform_bg: DataHandle, pub forward_uniform_bg: DataHandle, pub shadow: RenderTargetHandle, pub color: RenderTargetHandle, pub resolve: Option, - pub depth: RenderTargetHandle, + pub depth: DepthTargets, pub pre_skinning_buffers: DataHandle, } impl BaseRenderGraphIntermediateState { @@ -146,6 +215,7 @@ impl BaseRenderGraphIntermediateState { label: Some("shadow target".into()), resolution: eval_output.shadow_target_size, depth: 1, + mip_levels: Some(1), samples: SampleCount::One, format: INTERNAL_SHADOW_DEPTH_FORMAT, usage: TextureUsages::RENDER_ATTACHMENT | TextureUsages::TEXTURE_BINDING, @@ -157,6 +227,7 @@ impl BaseRenderGraphIntermediateState { resolution, depth: 1, samples, + mip_levels: Some(1), format: TextureFormat::Rgba16Float, usage: TextureUsages::RENDER_ATTACHMENT | TextureUsages::TEXTURE_BINDING, }); @@ -165,19 +236,13 @@ impl BaseRenderGraphIntermediateState { label: Some("hdr resolve".into()), resolution, depth: 1, + mip_levels: Some(1), samples: SampleCount::One, format: TextureFormat::Rgba16Float, usage: TextureUsages::RENDER_ATTACHMENT | TextureUsages::TEXTURE_BINDING, }) }); - let depth = graph.add_render_target(RenderTargetDescriptor { - label: Some("hdr depth".into()), - resolution, - depth: 1, - samples, - format: TextureFormat::Depth32Float, - usage: TextureUsages::RENDER_ATTACHMENT, - }); + let depth = DepthTargets::new(graph, resolution, samples); let pre_skinning_buffers = graph.add_data::(); @@ -219,6 +284,22 @@ impl BaseRenderGraphIntermediateState { resolution, ); } + pub fn shadow_object_uniform_upload<'node>( + &self, + graph: &mut RenderGraph<'node>, + base: &'node BaseRenderGraph, + eval_output: &InstructionEvaluationOutput, + ) { + for (shadow_index, shadow) in eval_output.shadows.iter().enumerate() { + base.gpu_culler.add_object_uniform_upload_to_graph::( + graph, + CameraIndex::Shadow(shadow_index as u32), + UVec2::splat(shadow.map.size), + SampleCount::One, + &format_sso!("Shadow Culling S{}", shadow_index), + ); + } + } /// Does all shadow culling for the PBR materials. pub fn pbr_shadow_culling<'node>(&self, graph: &mut RenderGraph<'node>, base: &'node BaseRenderGraph) { @@ -226,7 +307,8 @@ impl BaseRenderGraphIntermediateState { base.gpu_culler.add_culling_to_graph::( graph, shadow_culled, - Some(shadow_index), + self.shadow, + CameraIndex::Shadow(shadow_index as u32), &format_sso!("Shadow Culling S{}", shadow_index), ); } @@ -236,15 +318,48 @@ impl BaseRenderGraphIntermediateState { skinning::add_skinning_to_graph(graph, &base.gpu_skinner); } + pub fn object_uniform_upload<'node>( + &self, + graph: &mut RenderGraph<'node>, + base: &'node BaseRenderGraph, + resolution: UVec2, + samples: SampleCount, + ) { + base.gpu_culler.add_object_uniform_upload_to_graph::( + graph, + CameraIndex::Viewport, + resolution, + samples, + "Uniform Bake", + ); + } + /// Does all culling for the forward PBR materials. 
pub fn pbr_culling<'node>(&self, graph: &mut RenderGraph<'node>, base: &'node BaseRenderGraph) { - base.gpu_culler - .add_culling_to_graph::(graph, self.cull, None, "Primary Culling"); + base.gpu_culler.add_culling_to_graph::( + graph, + self.cull, + self.depth.single_sample_mipped, + CameraIndex::Viewport, + "Primary Culling", + ); + } + + /// Clear all the targets to their needed values + pub fn clear_shadow(&self, graph: &mut RenderGraph<'_>) { + crate::clear::add_clear_to_graph(graph, None, None, self.shadow, Vec4::ZERO, 0.0); } /// Clear all the targets to their needed values pub fn clear(&self, graph: &mut RenderGraph<'_>, clear_color: Vec4) { - crate::clear::add_clear_to_graph(graph, self.color, self.resolve, self.depth, clear_color, 0.0); + crate::clear::add_clear_to_graph( + graph, + Some(self.color), + self.resolve, + self.depth.rendering_target(), + clear_color, + 0.0, + ); } /// Render all shadows for the PBR materials. @@ -261,17 +376,17 @@ impl BaseRenderGraphIntermediateState { routine.add_forward_to_graph(RoutineAddToGraphArgs { graph, whole_frame_uniform_bg: self.shadow_uniform_bg, - culled: *shadow_cull, + culling_output_handle: Some(*shadow_cull), per_material: &pbr.per_material, extra_bgs: None, label: &format!("pbr shadow renderering S{shadow_index}"), samples: SampleCount::One, + camera: CameraIndex::Shadow(shadow_index as u32), color: None, resolve: None, depth: self .shadow - .restrict(0..1, ViewportRect::new(desc.map.offset, UVec2::splat(desc.map.size))), - data: shadow_index as u32, + .set_viewport(ViewportRect::new(desc.map.offset, UVec2::splat(desc.map.size))), }); } } @@ -289,7 +404,7 @@ impl BaseRenderGraphIntermediateState { graph, self.color, self.resolve, - self.depth, + self.depth.rendering_target(), self.forward_uniform_bg, samples, ); @@ -297,7 +412,7 @@ impl BaseRenderGraphIntermediateState { } /// Render the PBR materials. - pub fn pbr_forward_rendering_opaque<'node>( + pub fn pbr_render_opaque_predicted_triangles<'node>( &self, graph: &mut RenderGraph<'node>, pbr: &'node pbr::PbrRoutine, @@ -308,15 +423,40 @@ impl BaseRenderGraphIntermediateState { routine.add_forward_to_graph(RoutineAddToGraphArgs { graph, whole_frame_uniform_bg: self.forward_uniform_bg, - culled: self.cull, + culling_output_handle: None, per_material: &pbr.per_material, extra_bgs: None, - label: "PBR Forward", + label: "PBR Forward Pass 1", samples, + camera: CameraIndex::Viewport, color: Some(self.color), resolve: self.resolve, - depth: self.depth, - data: 0, + depth: self.depth.rendering_target(), + }); + } + } + + /// Render the PBR materials. 
+ pub fn pbr_render_opaque_residual_triangles<'node>( + &self, + graph: &mut RenderGraph<'node>, + pbr: &'node pbr::PbrRoutine, + samples: SampleCount, + ) { + let routines = [&pbr.opaque_routine, &pbr.cutout_routine]; + for routine in routines { + routine.add_forward_to_graph(RoutineAddToGraphArgs { + graph, + whole_frame_uniform_bg: self.forward_uniform_bg, + culling_output_handle: Some(self.cull), + per_material: &pbr.per_material, + extra_bgs: None, + label: "PBR Forward Pass 2", + samples, + camera: CameraIndex::Viewport, + color: Some(self.color), + resolve: self.resolve, + depth: self.depth.rendering_target(), }); } } @@ -331,18 +471,22 @@ impl BaseRenderGraphIntermediateState { pbr.blend_routine.add_forward_to_graph(RoutineAddToGraphArgs { graph, whole_frame_uniform_bg: self.forward_uniform_bg, - culled: self.cull, + culling_output_handle: Some(self.cull), per_material: &pbr.per_material, extra_bgs: None, label: "PBR Forward", + camera: CameraIndex::Viewport, samples, color: Some(self.color), resolve: self.resolve, - depth: self.depth, - data: 0, + depth: self.depth.rendering_target(), }); } + pub fn hi_z<'node>(&self, graph: &mut RenderGraph<'node>, pbr: &'node pbr::PbrRoutine, resolution: UVec2) { + pbr.hi_z.add_hi_z_to_graph(graph, self.depth, resolution); + } + /// Tonemap onto the given render target. pub fn tonemapping<'node>( &self, diff --git a/rend3-routine/src/clear.rs b/rend3-routine/src/clear.rs index 948cb943..576be017 100644 --- a/rend3-routine/src/clear.rs +++ b/rend3-routine/src/clear.rs @@ -9,7 +9,7 @@ use rend3::graph::{ /// it makes it a lot easier to udnerstand where the clear is coming from. pub fn add_clear_to_graph( graph: &mut RenderGraph<'_>, - color: RenderTargetHandle, + color: Option, resolve: Option, depth: RenderTargetHandle, clear_color: Vec4, @@ -17,21 +17,25 @@ pub fn add_clear_to_graph( ) { let mut builder = graph.add_node("Clear"); - let hdr_color_handle = builder.add_render_target(color, NodeResourceUsage::Output); + let hdr_color_handle = builder.add_optional_render_target(color, NodeResourceUsage::Output); let hdr_resolve = builder.add_optional_render_target(resolve, NodeResourceUsage::Output); let hdr_depth_handle = builder.add_render_target(depth, NodeResourceUsage::Output); let _rpass_handle = builder.add_renderpass(RenderPassTargets { - targets: vec![RenderPassTarget { - color: hdr_color_handle, - clear: wgpu::Color { - r: clear_color.x as f64, - g: clear_color.y as f64, - b: clear_color.z as f64, - a: clear_color.w as f64, - }, - resolve: hdr_resolve, - }], + targets: if let Some(hdr_color_handle) = hdr_color_handle { + vec![RenderPassTarget { + color: hdr_color_handle, + clear: wgpu::Color { + r: clear_color.x as f64, + g: clear_color.y as f64, + b: clear_color.z as f64, + a: clear_color.w as f64, + }, + resolve: hdr_resolve, + }] + } else { + vec![] + }, depth_stencil: Some(RenderPassDepthTarget { target: hdr_depth_handle, depth_clear: Some(depth_clear), diff --git a/rend3-routine/src/common/camera.rs b/rend3-routine/src/common/camera.rs new file mode 100644 index 00000000..6f09f5cc --- /dev/null +++ b/rend3-routine/src/common/camera.rs @@ -0,0 +1,36 @@ +/// Index representing which camera we're referring to. +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub enum CameraIndex { + Viewport, + Shadow(u32), +} + +impl CameraIndex { + /// Returns `true` if the camera index is [`Viewport`]. 
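The CameraIndex enum introduced in camera.rs above is how the routines tell the culler which camera's uniforms to use; on the GPU side the viewport camera is encoded as u32::MAX, which is the sentinel cull.wgsl checks in is_shadow_pass(). A usage sketch of the conversion, relying on to_shader_index as defined a little further down in this file:

    fn example_shader_indices() {
        assert_eq!(CameraIndex::Viewport.to_shader_index(), u32::MAX);
        assert_eq!(CameraIndex::Shadow(3).to_shader_index(), 3);
    }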
+ /// + /// [`Viewport`]: CameraIndex::Viewport + #[must_use] + pub fn is_viewport(&self) -> bool { + matches!(self, Self::Viewport) + } + + /// Returns `true` if the camera index is [`Shadow`]. + /// + /// [`Shadow`]: CameraIndex::Shadow + #[must_use] + pub fn is_shadow(&self) -> bool { + matches!(self, Self::Shadow(..)) + } + + /// Returns a shader compatible index for the camera, using u32::MAX for the viewport camera. + #[must_use] + pub fn to_shader_index(&self) -> u32 { + match *self { + Self::Viewport => u32::MAX, + Self::Shadow(index) => { + assert_ne!(index, u32::MAX, "Shadow camera index cannot be 0xFFFF_FFFF"); + index + } + } + } +} diff --git a/rend3-routine/src/common/interfaces.rs b/rend3-routine/src/common/interfaces.rs index 69ca73b1..cef01b7e 100644 --- a/rend3-routine/src/common/interfaces.rs +++ b/rend3-routine/src/common/interfaces.rs @@ -123,6 +123,15 @@ impl PerMaterialArchetypeInterface { }, None, ) + .append( + ShaderStages::VERTEX_FRAGMENT, + BindingType::Buffer { + ty: BufferBindingType::Storage { read_only: true }, + has_dynamic_offset: false, + min_binding_size: None, + }, + None, + ) .build(device, Some("per material bgl")); Self { diff --git a/rend3-routine/src/common/mod.rs b/rend3-routine/src/common/mod.rs index f8a2bed8..6acc7257 100644 --- a/rend3-routine/src/common/mod.rs +++ b/rend3-routine/src/common/mod.rs @@ -1,7 +1,9 @@ //! Common utilities used throughout the crate. +mod camera; mod interfaces; mod samplers; +pub use camera::*; pub use interfaces::*; pub use samplers::*; diff --git a/rend3-routine/src/culling/batching.rs b/rend3-routine/src/culling/batching.rs index 84eb924f..cff60c9d 100644 --- a/rend3-routine/src/culling/batching.rs +++ b/rend3-routine/src/culling/batching.rs @@ -1,13 +1,16 @@ -use std::cmp::Ordering; +use std::{cmp::Ordering, collections::HashMap}; use encase::ShaderType; use ordered_float::OrderedFloat; use rend3::{ - managers::{CameraManager, MaterialManager, ObjectManager, TextureBindGroupIndex}, - types::{Material, SortingOrder, SortingReason}, - util::math::round_up, + graph::NodeExecutionContext, + managers::{CameraManager, TextureBindGroupIndex}, + types::{GraphDataHandle, Material, RawObjectHandle, SortingOrder, SortingReason}, + util::{math::round_up, typedefs::FastHashMap}, }; +use crate::common::CameraIndex; + use super::{BATCH_SIZE, WORKGROUP_SIZE}; #[derive(Debug)] @@ -19,8 +22,6 @@ pub struct ShaderBatchDatas { #[derive(Debug)] pub(super) struct JobSubRegion { pub job_index: u32, - pub base_invocation: u32, - pub invocation_count: u32, pub key: ShaderJobKey, } @@ -80,48 +81,74 @@ impl Ord for ShaderJobSortingKey { #[derive(Debug, ShaderType)] pub struct ShaderBatchData { #[align(256)] - pub(super) ranges: [ShaderObjectRange; BATCH_SIZE], pub(super) total_objects: u32, pub(super) total_invocations: u32, - pub(super) base_output_invocation: u32, + pub(super) batch_base_invocation: u32, + pub(super) object_culling_information: [ShaderObjectCullingInformation; BATCH_SIZE], } #[derive(Debug, Copy, Clone, Default, ShaderType)] -pub(super) struct ShaderObjectRange { +pub(super) struct ShaderObjectCullingInformation { pub invocation_start: u32, pub invocation_end: u32, pub object_id: u32, + pub region_id: u32, + pub base_region_invocation: u32, + pub local_region_id: u32, + pub previous_global_invocation: u32, + pub atomic_capable: u32, +} + +/// Map containing the previous invocation of each object. 
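In the batching loop that follows, each object's triangle range is appended at the current invocation cursor and the cursor then advances by the triangle count rounded up to the culling workgroup size, so a workgroup never straddles two objects and find_object_info in cull.wgsl can resolve the whole workgroup's object from its first invocation alone. A worked sketch of that bookkeeping, assuming WORKGROUP_SIZE is 64 to match @workgroup_size(64) in the shader (advance_cursor is an illustrative helper):

    // An object with 100 triangles placed at invocation 0 owns 0..100; the
    // cursor then jumps to 128, where the next object starts.
    fn advance_cursor(cursor: u32, triangle_count: u32) -> u32 {
        cursor + ((triangle_count + 63) / 64) * 64
    }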
+pub struct PerCameraPreviousInvocationsMap { + inner: FastHashMap>, +} +impl PerCameraPreviousInvocationsMap { + pub fn new() -> Self { + Self { + inner: HashMap::default(), + } + } + + pub fn get_and_reset_camera(&mut self, camera: CameraIndex) -> FastHashMap { + self.inner.remove(&camera).unwrap_or_default() + } + + pub fn set_camera(&mut self, camera: CameraIndex, previous_invocations: FastHashMap) { + self.inner.insert(camera, previous_invocations); + } } pub(super) fn batch_objects( - material_manager: &MaterialManager, - object_manager: &ObjectManager, - camera_manager: &CameraManager, - max_dispatch: u32, + ctx: &mut NodeExecutionContext, + previous_invocation_map_handle: &GraphDataHandle, + camera: &CameraManager, + camera_idx: CameraIndex, ) -> ShaderBatchDatas { profiling::scope!("Batch Objects"); + let mut per_camera_previous_invocation_map = ctx.data_core.graph_storage.get_mut(previous_invocation_map_handle); + let previous_invocation_map = per_camera_previous_invocation_map.get_and_reset_camera(camera_idx); + let mut current_invocation_map = FastHashMap::default(); + let mut jobs = ShaderBatchDatas { jobs: Vec::new(), regions: Vec::new(), }; - let objects = match object_manager.enumerated_objects::() { + let objects = match ctx.data_core.object_manager.enumerated_objects::() { Some(o) => o, None => return jobs, }; - let material_archetype = material_manager.archetype_view::(); + let material_archetype = ctx.data_core.material_manager.archetype_view::(); let mut sorted_objects = Vec::with_capacity(objects.len()); { profiling::scope!("Sort Key Creation"); for (handle, object) in objects { // Frustum culling - if !camera_manager - .world_frustum() - .contains_sphere(object.inner.bounding_sphere) - { + if !camera.world_frustum().contains_sphere(object.inner.bounding_sphere) { continue; } @@ -134,7 +161,11 @@ pub(super) fn batch_objects( let material_key = material.inner.key(); let sorting = material.inner.sorting(); - let mut distance_sq = camera_manager.location().distance_squared(object.location.into()); + let mut distance_sq = ctx + .data_core + .camera_manager + .location() + .distance_squared(object.location.into()); if sorting.order == SortingOrder::BackToFront { distance_sq = -distance_sq; } @@ -160,36 +191,49 @@ pub(super) fn batch_objects( if !sorted_objects.is_empty() { profiling::scope!("Batch Data Creation"); + let mut current_region_idx = 0_u32; + let mut current_region_object_index = 0_u32; let mut current_base_invocation = 0_u32; let mut current_region_invocation = 0_u32; let mut current_invocation = 0_u32; let mut current_object_index = 0_u32; - let mut current_ranges = [ShaderObjectRange::default(); BATCH_SIZE]; + let mut current_ranges = [ShaderObjectCullingInformation::default(); BATCH_SIZE]; let mut current_key = sorted_objects.first().unwrap().0.job_key; - for (ShaderJobSortingKey { job_key: key, .. }, handle, object) in sorted_objects { + let max_dispatch_count = ctx.renderer.limits.max_compute_workgroups_per_dimension; + + for ( + ShaderJobSortingKey { + job_key: key, + sorting_reason, + .. 
+ }, + handle, + object, + ) in sorted_objects + { let invocation_count = object.inner.index_count / 3; let key_difference = key != current_key; let object_limit = current_object_index == 256; - let dispatch_limit = (current_invocation + invocation_count) >= max_dispatch * WORKGROUP_SIZE; + let dispatch_limit = (current_invocation + invocation_count) >= max_dispatch_count * WORKGROUP_SIZE; if key_difference || object_limit || dispatch_limit { jobs.regions.push(JobSubRegion { job_index: jobs.jobs.len() as u32, - base_invocation: current_region_invocation, - invocation_count: current_invocation - current_region_invocation, key: current_key, }); + current_region_idx += 1; current_key = key; + current_region_object_index = 0; current_region_invocation = current_invocation; } if object_limit || dispatch_limit { jobs.jobs.push(ShaderBatchData { - ranges: current_ranges, + object_culling_information: current_ranges, total_objects: current_object_index, total_invocations: current_invocation, - base_output_invocation: current_base_invocation, + batch_base_invocation: current_base_invocation, }); current_base_invocation += current_invocation; @@ -198,30 +242,38 @@ pub(super) fn batch_objects( current_object_index = 0; } - let range = ShaderObjectRange { + let range = ShaderObjectCullingInformation { invocation_start: current_invocation, invocation_end: current_invocation + invocation_count, + region_id: current_region_idx, object_id: handle.idx as u32, + base_region_invocation: current_region_invocation, + local_region_id: current_region_object_index, + previous_global_invocation: previous_invocation_map.get(&handle).copied().unwrap_or(u32::MAX), + atomic_capable: matches!(sorting_reason, SortingReason::Optimization) as u32, }; + current_invocation_map.insert(handle, current_invocation + current_base_invocation); + current_ranges[current_object_index as usize] = range; current_object_index += 1; + current_region_object_index += 1; current_invocation += round_up(invocation_count, WORKGROUP_SIZE); } jobs.regions.push(JobSubRegion { job_index: jobs.jobs.len() as u32, - base_invocation: current_region_invocation, - invocation_count: current_invocation - current_region_invocation, key: current_key, }); jobs.jobs.push(ShaderBatchData { - ranges: current_ranges, + object_culling_information: current_ranges, total_objects: current_object_index, total_invocations: current_invocation, - base_output_invocation: current_base_invocation, + batch_base_invocation: current_base_invocation, }); } + per_camera_previous_invocation_map.set_camera(camera_idx, current_invocation_map); + jobs } diff --git a/rend3-routine/src/culling/culler.rs b/rend3-routine/src/culling/culler.rs index 68e0f8ab..bf4dcd7c 100644 --- a/rend3-routine/src/culling/culler.rs +++ b/rend3-routine/src/culling/culler.rs @@ -8,110 +8,191 @@ use std::{ }; use encase::{ShaderSize, ShaderType, StorageBuffer}; +use glam::{Mat4, UVec2, Vec2}; use rend3::{ format_sso, - graph::{DataHandle, NodeExecutionContext, NodeResourceUsage, RenderGraph}, - managers::{ShaderObject, TextureBindGroupIndex}, - types::{GraphDataHandle, Material}, - util::{ - math::{round_up, round_up_div}, - typedefs::FastHashMap, - }, + graph::{DataHandle, DeclaredDependency, NodeExecutionContext, NodeResourceUsage, RenderGraph, RenderTargetHandle}, + managers::{CameraManager, ShaderObject, TextureBindGroupIndex}, + types::{GraphDataHandle, Material, MaterialArray, SampleCount, VERTEX_ATTRIBUTE_POSITION}, + util::{frustum::Frustum, math::IntegerExt, typedefs::FastHashMap}, Renderer, 
ShaderPreProcessor, ShaderVertexBufferConfig, }; use wgpu::{ - self, BindGroupDescriptor, BindGroupEntry, BindGroupLayout, BindGroupLayoutDescriptor, BindGroupLayoutEntry, - BindingType, Buffer, BufferBinding, BufferBindingType, BufferDescriptor, BufferUsages, ComputePassDescriptor, - ComputePipeline, ComputePipelineDescriptor, Device, PipelineLayoutDescriptor, ShaderModuleDescriptor, ShaderStages, + self, AddressMode, BindGroupDescriptor, BindGroupEntry, BindGroupLayout, BindGroupLayoutDescriptor, + BindGroupLayoutEntry, BindingResource, BindingType, Buffer, BufferBinding, BufferBindingType, BufferDescriptor, + BufferUsages, CommandEncoder, ComputePassDescriptor, ComputePipeline, ComputePipelineDescriptor, Device, + FilterMode, PipelineLayoutDescriptor, Queue, Sampler, SamplerBindingType, SamplerDescriptor, + ShaderModuleDescriptor, ShaderStages, TextureSampleType, TextureViewDimension, }; -use crate::culling::{ - batching::{batch_objects, JobSubRegion, ShaderBatchData, ShaderBatchDatas}, - WORKGROUP_SIZE, +use crate::{ + common::CameraIndex, + culling::{ + batching::{batch_objects, JobSubRegion, PerCameraPreviousInvocationsMap, ShaderBatchData, ShaderBatchDatas}, + suballoc::InputOutputBuffer, + WORKGROUP_SIZE, + }, }; -// 16 MB of indices -const OUTPUT_BUFFER_ROUNDING_SIZE: u64 = 1 << 24; -// At least 64 batches -const BATCH_DATA_ROUNDING_SIZE: u64 = ShaderBatchData::SHADER_SIZE.get() * 64; - #[derive(Debug)] pub struct DrawCallSet { - pub buffers: CullingBuffers>, + pub culling_data_buffer: Buffer, + pub per_camera_uniform: Arc, pub draw_calls: Vec, /// Range of draw calls in the draw call array corresponding to a given material key. pub material_key_ranges: HashMap>, } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct DrawCall { pub bind_group_index: TextureBindGroupIndex, - pub index_range: Range, pub batch_index: u32, } #[derive(Default)] -struct CullingBufferMap { - inner: FastHashMap, CullingBuffers>>, +pub struct CullingBufferMap { + inner: FastHashMap, } impl CullingBufferMap { - fn get_buffers( + pub fn get_buffers(&self, camera: CameraIndex) -> Option<&CullingBuffers> { + self.inner.get(&camera) + } + + fn get_or_resize_buffers( &mut self, + queue: &Queue, device: &Device, - camera: Option, - mut sizes: CullingBuffers, - ) -> &CullingBuffers> { - sizes.object_reference = round_up(sizes.object_reference.max(1), BATCH_DATA_ROUNDING_SIZE); - sizes.index = round_up(sizes.index.max(1), OUTPUT_BUFFER_ROUNDING_SIZE); - + encoder: &mut CommandEncoder, + camera: CameraIndex, + sizes: CullingBufferSizes, + ) -> &mut CullingBuffers { match self.inner.entry(camera) { Entry::Occupied(b) => { let b = b.into_mut(); - let current_size = CullingBuffers { - object_reference: b.object_reference.size(), - index: b.index.size(), - }; - if current_size != sizes { - *b = CullingBuffers::new(device, sizes); - } + b.update_sizes(queue, device, encoder, sizes); + b } - Entry::Vacant(b) => b.insert(CullingBuffers::new(device, sizes)), + Entry::Vacant(b) => b.insert(CullingBuffers::new(device, queue, sizes)), } } } -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct CullingBuffers { - pub object_reference: T, - pub index: T, +struct CullingBufferSizes { + invocations: u64, + draw_calls: u64, } -impl CullingBuffers> { - pub fn new(device: &Device, sizes: CullingBuffers) -> Self { - CullingBuffers { - object_reference: Arc::new(device.create_buffer(&BufferDescriptor { - label: None, - size: sizes.object_reference, - usage: BufferUsages::STORAGE | BufferUsages::COPY_DST, - mapped_at_creation: 
false, - })), - index: Arc::new(device.create_buffer(&BufferDescriptor { - label: None, - size: sizes.index, - usage: BufferUsages::STORAGE | BufferUsages::INDEX, - mapped_at_creation: false, - })), +#[derive(Debug)] +pub struct CullingBuffers { + pub index_buffer: InputOutputBuffer, + pub draw_call_buffer: InputOutputBuffer, + pub culling_results_buffer: InputOutputBuffer, +} + +impl CullingBuffers { + fn new(device: &Device, queue: &Queue, sizes: CullingBufferSizes) -> Self { + Self { + // One element per triangle/invocation + index_buffer: InputOutputBuffer::new(device, queue, sizes.invocations, "Index Buffer", 4, 4, false), + draw_call_buffer: InputOutputBuffer::new(device, queue, sizes.draw_calls, "Draw Call Buffer", 20, 4, true), + culling_results_buffer: InputOutputBuffer::new( + device, + queue, + // 32 bits in a u32 + sizes.invocations.div_round_up(u32::BITS as _), + "Culling Results Buffer", + 4, + 4, + false, + ), + } + } + + fn update_sizes( + &mut self, + queue: &Queue, + device: &Device, + encoder: &mut CommandEncoder, + sizes: CullingBufferSizes, + ) { + self.index_buffer.swap(queue, device, encoder, sizes.invocations * 3); + self.draw_call_buffer.swap(queue, device, encoder, sizes.draw_calls); + self.culling_results_buffer + .swap(queue, device, encoder, sizes.invocations.div_round_up(32)); + } +} + +#[derive(Debug, Copy, Clone)] +pub enum TriangleVisibility { + PositiveAreaVisible, + NegativeAreaVisible, +} + +impl TriangleVisibility { + fn from_winding_and_face(winding: wgpu::FrontFace, culling: wgpu::Face) -> Self { + match (winding, culling) { + (wgpu::FrontFace::Ccw, wgpu::Face::Back) => TriangleVisibility::PositiveAreaVisible, + (wgpu::FrontFace::Ccw, wgpu::Face::Front) => TriangleVisibility::NegativeAreaVisible, + (wgpu::FrontFace::Cw, wgpu::Face::Back) => TriangleVisibility::NegativeAreaVisible, + (wgpu::FrontFace::Cw, wgpu::Face::Front) => TriangleVisibility::PositiveAreaVisible, } } + + fn is_positive(self) -> bool { + match self { + TriangleVisibility::PositiveAreaVisible => true, + TriangleVisibility::NegativeAreaVisible => false, + } + } +} + +bitflags::bitflags! { + struct PerCameraUniformFlags: u32 { + const POSTIIVE_AREA_VISIBLE = 1 << 0; + const MULTISAMPLED = 1 << 1; + } +} + +#[derive(ShaderType)] +struct PerCameraUniform { + // TODO: use less space + view: Mat4, + // TODO: use less space + view_proj: Mat4, + // The index of which shadow caster we are rendering for. + // + // This will be u32::MAX if we're rendering for a camera, not a shadow map. 
+ shadow_index: u32, + frustum: Frustum, + resolution: Vec2, + // Created from PerCameraUniformFlags + flags: u32, + object_count: u32, + #[size(runtime)] + objects: Vec, +} + +#[derive(ShaderType)] +struct PerCameraUniformObjectData { + // TODO: use less space + model_view: Mat4, + // TODO: use less space + model_view_proj: Mat4, } pub struct GpuCuller { - bgl: BindGroupLayout, - pipeline: ComputePipeline, + prep_bgl: BindGroupLayout, + prep_pipeline: ComputePipeline, + culling_bgl: BindGroupLayout, + culling_pipeline: ComputePipeline, + sampler: Sampler, + winding: wgpu::FrontFace, type_id: TypeId, - culling_buffer_map_handle: GraphDataHandle, + per_material_buffer_handle: GraphDataHandle>>, + pub culling_buffer_map_handle: GraphDataHandle, + previous_invocation_map_handle: GraphDataHandle, } impl GpuCuller { @@ -121,22 +202,85 @@ impl GpuCuller { { let type_name = type_name::(); - let source = spp + let prep_source = spp .render_shader( - "rend3-routine/cull.wgsl", + "rend3-routine/uniform_prep.wgsl", &(), Some(&ShaderVertexBufferConfig::from_material::()), ) .unwrap(); - let sm = renderer.device.create_shader_module(ShaderModuleDescriptor { + let prep_sm = renderer.device.create_shader_module(ShaderModuleDescriptor { + label: Some(&format_sso!("UniformPrep {type_name} SM")), + source: wgpu::ShaderSource::Wgsl(Cow::Owned(prep_source)), + }); + + let prep_bgl = renderer.device.create_bind_group_layout(&BindGroupLayoutDescriptor { + label: Some(&format_sso!("UniformPrep {type_name} BGL")), + entries: &[ + // Object + BindGroupLayoutEntry { + binding: 0, + visibility: ShaderStages::COMPUTE, + ty: BindingType::Buffer { + ty: BufferBindingType::Storage { read_only: true }, + has_dynamic_offset: false, + min_binding_size: Some(ShaderObject::::SHADER_SIZE), + }, + count: None, + }, + // Object + BindGroupLayoutEntry { + binding: 1, + visibility: ShaderStages::COMPUTE, + ty: BindingType::Buffer { + ty: BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: Some(PerCameraUniform::min_size()), + }, + count: None, + }, + ], + }); + + let prep_pll = renderer.device.create_pipeline_layout(&PipelineLayoutDescriptor { + label: Some(&format_sso!("UniformPrep {type_name} PLL")), + bind_group_layouts: &[&prep_bgl], + push_constant_ranges: &[], + }); + + let prep_pipeline = renderer.device.create_compute_pipeline(&ComputePipelineDescriptor { + label: Some(&format_sso!("UniformPrep {type_name} PLL")), + layout: Some(&prep_pll), + module: &prep_sm, + entry_point: "cs_main", + }); + + let position_offset = M::supported_attributes() + .into_iter() + .enumerate() + .find_map(|(idx, a)| (*a == *VERTEX_ATTRIBUTE_POSITION).then_some(idx)) + .unwrap(); + + let culling_source = spp + .render_shader( + "rend3-routine/cull.wgsl", + &serde_json::json! 
{{ + "position_attribute_offset": position_offset, + }}, + Some(&ShaderVertexBufferConfig::from_material::()), + ) + .unwrap(); + + let culling_sm = renderer.device.create_shader_module(ShaderModuleDescriptor { label: Some(&format_sso!("GpuCuller {type_name} SM")), - source: wgpu::ShaderSource::Wgsl(Cow::Owned(source)), + source: wgpu::ShaderSource::Wgsl(Cow::Owned(culling_source)), }); - let bgl = renderer.device.create_bind_group_layout(&BindGroupLayoutDescriptor { + let culling_bgl = renderer.device.create_bind_group_layout(&BindGroupLayoutDescriptor { label: Some(&format_sso!("GpuCuller {type_name} BGL")), entries: &[ + // Vertex Buffer BindGroupLayoutEntry { binding: 0, visibility: ShaderStages::COMPUTE, @@ -147,6 +291,7 @@ impl GpuCuller { }, count: None, }, + // Object Buffer BindGroupLayoutEntry { binding: 1, visibility: ShaderStages::COMPUTE, @@ -157,6 +302,7 @@ impl GpuCuller { }, count: None, }, + // Batch data BindGroupLayoutEntry { binding: 2, visibility: ShaderStages::COMPUTE, @@ -167,43 +313,243 @@ impl GpuCuller { }, count: None, }, + // Draw Calls BindGroupLayoutEntry { binding: 3, visibility: ShaderStages::COMPUTE, ty: BindingType::Buffer { ty: BufferBindingType::Storage { read_only: false }, has_dynamic_offset: false, - min_binding_size: Some(NonZeroU64::new(4).unwrap()), + min_binding_size: Some(NonZeroU64::new(20 + 8).unwrap()), + }, + count: None, + }, + // Index buffer + BindGroupLayoutEntry { + binding: 4, + visibility: ShaderStages::COMPUTE, + ty: BindingType::Buffer { + ty: BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: Some(NonZeroU64::new(4 + 8).unwrap()), + }, + count: None, + }, + // Culling Results + BindGroupLayoutEntry { + binding: 5, + visibility: ShaderStages::COMPUTE, + ty: BindingType::Buffer { + ty: BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: Some(NonZeroU64::new(4 + 8).unwrap()), }, count: None, }, + // per camera uniforms + BindGroupLayoutEntry { + binding: 6, + visibility: ShaderStages::COMPUTE, + ty: BindingType::Buffer { + ty: BufferBindingType::Storage { read_only: true }, + has_dynamic_offset: false, + min_binding_size: Some(PerCameraUniform::min_size()), + }, + count: None, + }, + // hirearchical z buffer + BindGroupLayoutEntry { + binding: 7, + visibility: ShaderStages::COMPUTE, + ty: BindingType::Texture { + sample_type: TextureSampleType::Depth, + view_dimension: TextureViewDimension::D2, + multisampled: false, + }, + count: None, + }, + // hirearchical z buffer + BindGroupLayoutEntry { + binding: 8, + visibility: ShaderStages::COMPUTE, + ty: BindingType::Sampler(SamplerBindingType::NonFiltering), + count: None, + }, ], }); - let pll = renderer.device.create_pipeline_layout(&PipelineLayoutDescriptor { + let culling_pll = renderer.device.create_pipeline_layout(&PipelineLayoutDescriptor { label: Some(&format_sso!("GpuCuller {type_name} PLL")), - bind_group_layouts: &[&bgl], + bind_group_layouts: &[&culling_bgl], push_constant_ranges: &[], }); - let pipeline = renderer.device.create_compute_pipeline(&ComputePipelineDescriptor { + let culling_pipeline = renderer.device.create_compute_pipeline(&ComputePipelineDescriptor { label: Some(&format_sso!("GpuCuller {type_name} PLL")), - layout: Some(&pll), - module: &sm, + layout: Some(&culling_pll), + module: &culling_sm, entry_point: "cs_main", }); + let sampler = renderer.device.create_sampler(&SamplerDescriptor { + label: Some("HiZ Sampler"), + address_mode_u: AddressMode::ClampToEdge, + 
address_mode_v: AddressMode::ClampToEdge, + address_mode_w: AddressMode::ClampToEdge, + mag_filter: FilterMode::Nearest, + min_filter: FilterMode::Nearest, + mipmap_filter: FilterMode::Nearest, + lod_min_clamp: 0.0, + lod_max_clamp: 100.0, + compare: None, + anisotropy_clamp: 1, + border_color: None, + }); + + let per_material_buffer_handle = renderer.add_graph_data(HashMap::default()); let culling_buffer_map_handle = renderer.add_graph_data(CullingBufferMap::default()); + let previous_invocation_map_handle = renderer.add_graph_data(PerCameraPreviousInvocationsMap::new()); Self { - bgl, - pipeline, + prep_bgl, + prep_pipeline, + culling_bgl, + culling_pipeline, + sampler, + winding: renderer.handedness.into(), type_id: TypeId::of::(), + per_material_buffer_handle, culling_buffer_map_handle, + previous_invocation_map_handle, } } - pub fn cull(&self, ctx: &mut NodeExecutionContext, jobs: ShaderBatchDatas, camera: Option) -> DrawCallSet + pub fn object_uniform_upload( + &self, + ctx: &mut NodeExecutionContext, + camera: &CameraManager, + camera_idx: CameraIndex, + resolution: UVec2, + samples: SampleCount, + ) where + M: Material, + { + profiling::scope!("GpuCuller::object_uniform_upload"); + + assert_eq!(TypeId::of::(), self.type_id); + + let type_name = type_name::(); + + let encoder = ctx.encoder_or_pass.take_encoder(); + + // TODO: Isolate all this into a struct + let max_object_count = ctx + .data_core + .object_manager + .buffer::() + .map(wgpu::Buffer::size) + .unwrap_or(0) + / ShaderObject::::SHADER_SIZE.get(); + + if max_object_count == 0 { + return; + } + + let per_map_buffer_size = ((max_object_count - 1) * PerCameraUniformObjectData::SHADER_SIZE.get()) + + PerCameraUniform::min_size().get(); + + let mut per_mat_buffer_map = ctx.data_core.graph_storage.get_mut(&self.per_material_buffer_handle); + + let new_per_mat_buffer = || { + Arc::new(ctx.renderer.device.create_buffer(&BufferDescriptor { + label: None, + size: per_map_buffer_size, + usage: BufferUsages::STORAGE | BufferUsages::COPY_SRC | BufferUsages::COPY_DST, + mapped_at_creation: false, + })) + }; + let buffer = match per_mat_buffer_map.entry(camera_idx) { + Entry::Occupied(o) => { + let r = o.into_mut(); + if r.size() != per_map_buffer_size { + *r = new_per_mat_buffer(); + } + r + } + Entry::Vacant(o) => o.insert(new_per_mat_buffer()), + }; + + let culling = match camera_idx { + CameraIndex::Shadow(_) => wgpu::Face::Front, + CameraIndex::Viewport => wgpu::Face::Back, + }; + + { + // We don't write anything in the objects right now, as this will be filled in by the preparation compute shader + profiling::scope!("PerCameraUniform Data Upload"); + let per_camera_data = PerCameraUniform { + view: camera.view(), + view_proj: camera.view_proj(), + shadow_index: camera_idx.to_shader_index(), + frustum: camera.world_frustum(), + resolution: resolution.as_vec2(), + flags: { + let mut flags = PerCameraUniformFlags::empty(); + flags.set( + PerCameraUniformFlags::POSTIIVE_AREA_VISIBLE, + TriangleVisibility::from_winding_and_face(self.winding, culling).is_positive(), + ); + flags.set(PerCameraUniformFlags::MULTISAMPLED, samples != SampleCount::One); + flags.bits() + }, + object_count: max_object_count as u32, + objects: Vec::new(), + }; + let mut buffer = ctx + .renderer + .queue + .write_buffer_with(buffer, 0, per_camera_data.size()) + .unwrap(); + StorageBuffer::new(&mut *buffer).write(&per_camera_data).unwrap(); + } + + let Some(object_manager_buffer) = ctx.data_core.object_manager.buffer::() else { + return; + }; + let prep_bg = 
ctx.renderer.device.create_bind_group(&BindGroupDescriptor { + label: Some(&format_sso!("UniformPrep {type_name} BG")), + layout: &self.prep_bgl, + entries: &[ + BindGroupEntry { + binding: 0, + resource: object_manager_buffer.as_entire_binding(), + }, + BindGroupEntry { + binding: 1, + resource: buffer.as_entire_binding(), + }, + ], + }); + + profiling::scope!("Command Encoding"); + + let mut cpass = encoder.begin_compute_pass(&ComputePassDescriptor { + label: Some(&format_sso!("GpuCuller {type_name} uniform bake")), + timestamp_writes: None, + }); + cpass.set_pipeline(&self.prep_pipeline); + cpass.set_bind_group(0, &prep_bg, &[]); + cpass.dispatch_workgroups((max_object_count as u32).div_round_up(WORKGROUP_SIZE), 1, 1); + drop(cpass); + } + + pub fn cull( + &self, + ctx: &mut NodeExecutionContext, + jobs: ShaderBatchDatas, + depth_handle: DeclaredDependency, + camera_idx: CameraIndex, + ) -> DrawCallSet where M: Material, { @@ -217,38 +563,56 @@ impl GpuCuller { .jobs .iter() .map(|j: &ShaderBatchData| { - debug_assert_eq!(j.total_invocations % 256, 0); + debug_assert_eq!(j.total_invocations % WORKGROUP_SIZE, 0); j.total_invocations }) .sum(); - let buffers = ctx - .data_core - .graph_storage - .get_mut(&self.culling_buffer_map_handle) - .get_buffers( - &ctx.renderer.device, - camera, - CullingBuffers { - object_reference: jobs.jobs.size().get(), - index: ::max(total_invocations as u64 * 3 * 4, 4), - }, - ) - .clone(); + let encoder = ctx.encoder_or_pass.take_encoder(); + + let mut culling_buffer_map = ctx.data_core.graph_storage.get_mut(&self.culling_buffer_map_handle); + let buffers = culling_buffer_map.get_or_resize_buffers( + &ctx.renderer.queue, + &ctx.renderer.device, + encoder, + camera_idx, + CullingBufferSizes { + invocations: total_invocations as u64, + draw_calls: jobs.regions.len() as u64, + }, + ); + + let per_camera_uniform = Arc::clone( + ctx.data_core + .graph_storage + .get_mut(&self.per_material_buffer_handle) + .get(&camera_idx) + .unwrap_or_else(|| panic!("No per camera uniform for camera {:?}", camera_idx)), + ); + + let culling_data_buffer = { + profiling::scope!("Culling Job Data Upload"); + + let culling_data_buffer = ctx.renderer.device.create_buffer(&wgpu::BufferDescriptor { + label: Some("Culling Data Buffer"), + size: jobs.jobs.size().get(), + usage: wgpu::BufferUsages::STORAGE, + mapped_at_creation: true, + }); - { - profiling::scope!("Culling Job Data Format"); - let mut buffer = ctx - .renderer - .queue - .write_buffer_with(&buffers.object_reference, 0, jobs.jobs.size()) - .unwrap(); - StorageBuffer::new(&mut *buffer).write(&jobs.jobs).unwrap(); - } + let mut mapping = culling_data_buffer.slice(..).get_mapped_range_mut(); + StorageBuffer::new(&mut *mapping).write(&jobs.jobs).unwrap(); + drop(mapping); + culling_data_buffer.unmap(); - let bg = ctx.renderer.device.create_bind_group(&BindGroupDescriptor { + culling_data_buffer + }; + + let hi_z_buffer = ctx.graph_data.get_render_target(depth_handle); + + let culling_bg = ctx.renderer.device.create_bind_group(&BindGroupDescriptor { label: Some(&format_sso!("GpuCuller {type_name} BG")), - layout: &self.bgl, + layout: &self.culling_bgl, entries: &[ BindGroupEntry { binding: 0, @@ -260,15 +624,35 @@ impl GpuCuller { }, BindGroupEntry { binding: 2, - resource: wgpu::BindingResource::Buffer(BufferBinding { - buffer: &buffers.object_reference, + resource: BindingResource::Buffer(BufferBinding { + buffer: &culling_data_buffer, offset: 0, size: Some(ShaderBatchData::SHADER_SIZE), }), }, BindGroupEntry { binding: 3, - 
resource: buffers.index.as_entire_binding(), + resource: buffers.draw_call_buffer.as_entire_binding(), + }, + BindGroupEntry { + binding: 4, + resource: buffers.index_buffer.as_entire_binding(), + }, + BindGroupEntry { + binding: 5, + resource: buffers.culling_results_buffer.as_entire_binding(), + }, + BindGroupEntry { + binding: 6, + resource: per_camera_uniform.as_entire_binding(), + }, + BindGroupEntry { + binding: 7, + resource: BindingResource::TextureView(hi_z_buffer), + }, + BindGroupEntry { + binding: 8, + resource: BindingResource::Sampler(&self.sampler), }, ], }); @@ -289,11 +673,7 @@ impl GpuCuller { current_material_key_range_start = range_end; } - let job = &jobs.jobs[region.job_index as usize]; - let start = (job.base_output_invocation + region.base_invocation) * 3; - let end = start + (region.invocation_count * 3); draw_calls.push(DrawCall { - index_range: start..end, bind_group_index: region.key.bind_group_index, batch_index: region.job_index, }); @@ -301,57 +681,82 @@ impl GpuCuller { material_key_ranges.insert(current_material_key, current_material_key_range_start..draw_calls.len()); - let mut cpass = ctx - .encoder_or_pass - .take_encoder() - .begin_compute_pass(&ComputePassDescriptor { - label: Some(&format_sso!("GpuCuller {type_name} Culling")), - timestamp_writes: None, - }); - cpass.set_pipeline(&self.pipeline); + encoder.clear_buffer(&buffers.draw_call_buffer, 8, None); + let mut cpass = encoder.begin_compute_pass(&ComputePassDescriptor { + label: Some(&format_sso!("GpuCuller {type_name} Culling")), + timestamp_writes: None, + }); + + cpass.set_pipeline(&self.culling_pipeline); for (idx, job) in jobs.jobs.iter().enumerate() { - cpass.set_bind_group(0, &bg, &[idx as u32 * ShaderBatchData::SHADER_SIZE.get() as u32]); - cpass.dispatch_workgroups(round_up_div(job.total_invocations, WORKGROUP_SIZE), 1, 1); + // RA can't infer this + let job: &ShaderBatchData = job; + + cpass.set_bind_group( + 0, + &culling_bg, + &[idx as u32 * ShaderBatchData::SHADER_SIZE.get() as u32], + ); + cpass.dispatch_workgroups(job.total_invocations.div_round_up(WORKGROUP_SIZE), 1, 1); } drop(cpass); DrawCallSet { - buffers, + culling_data_buffer, + per_camera_uniform, draw_calls, material_key_ranges, } } + pub fn add_object_uniform_upload_to_graph<'node, M: Material>( + &'node self, + graph: &mut RenderGraph<'node>, + camera_idx: CameraIndex, + resolution: UVec2, + samples: SampleCount, + name: &str, + ) { + let mut node = graph.add_node(name); + node.add_side_effect(); + + node.build(move |mut ctx| { + let camera = match camera_idx { + CameraIndex::Shadow(i) => &ctx.eval_output.shadows[i as usize].camera, + CameraIndex::Viewport => &ctx.data_core.camera_manager, + }; + + self.object_uniform_upload::(&mut ctx, camera, camera_idx, resolution, samples); + }); + } + pub fn add_culling_to_graph<'node, M: Material>( &'node self, graph: &mut RenderGraph<'node>, - draw_calls_hdl: DataHandle, - camera: Option, + draw_calls_hdl: DataHandle>, + depth_handle: RenderTargetHandle, + camera_idx: CameraIndex, name: &str, ) { let mut node = graph.add_node(name); let output = node.add_data(draw_calls_hdl, NodeResourceUsage::Output); + let depth_handle = node.add_render_target(depth_handle, NodeResourceUsage::Input); node.build(move |mut ctx| { - let camera_manager = match camera { - Some(i) => &ctx.eval_output.shadows[i].camera, - None => &ctx.data_core.camera_manager, + let camera = match camera_idx { + CameraIndex::Shadow(i) => &ctx.eval_output.shadows[i as usize].camera, + CameraIndex::Viewport => 
&ctx.data_core.camera_manager,
             };
-            let jobs = batch_objects::<M>(
-                &ctx.data_core.material_manager,
-                &ctx.data_core.object_manager,
-                camera_manager,
-                ctx.renderer.limits.max_compute_workgroups_per_dimension,
-            );
+            let jobs = batch_objects::<M>(&mut ctx, &self.previous_invocation_map_handle, camera, camera_idx);
             if jobs.jobs.is_empty() {
                 return;
             }
-            let draw_calls = self.cull::<M>(&mut ctx, jobs, camera);
+            let draw_calls = self.cull::<M>(&mut ctx, jobs, depth_handle, camera_idx);
-            ctx.graph_data.set_data(output, Some(draw_calls));
+            ctx.graph_data.set_data(output, Some(Arc::new(draw_calls)));
         });
     }
 }
diff --git a/rend3-routine/src/culling/mod.rs b/rend3-routine/src/culling/mod.rs
index ce90aa48..805aa218 100644
--- a/rend3-routine/src/culling/mod.rs
+++ b/rend3-routine/src/culling/mod.rs
@@ -1,8 +1,10 @@
 const BATCH_SIZE: usize = 256;
-const WORKGROUP_SIZE: u32 = 256;
+const WORKGROUP_SIZE: u32 = 64;
 mod batching;
 mod culler;
+mod suballoc;
 pub use batching::{ShaderBatchData, ShaderBatchDatas};
-pub use culler::{DrawCall, DrawCallSet, GpuCuller};
+pub use culler::{CullingBufferMap, DrawCall, DrawCallSet, GpuCuller};
+pub use suballoc::{InputOutputBuffer, InputOutputPartition};
diff --git a/rend3-routine/src/culling/suballoc.rs b/rend3-routine/src/culling/suballoc.rs
new file mode 100644
index 00000000..467b2c2d
--- /dev/null
+++ b/rend3-routine/src/culling/suballoc.rs
@@ -0,0 +1,227 @@
+use std::{
+    ops::{Deref, Range},
+    sync::Arc,
+};
+
+use encase::{internal::WriteInto, ShaderType, StorageBuffer};
+use rend3::util::{math::IntegerExt, typedefs::SsoString};
+use wgpu::CommandEncoder;
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub enum InputOutputPartition {
+    Input,
+    Output,
+}
+
+#[derive(Debug)]
+pub struct InputOutputBuffer {
+    /// Label for the buffer
+    label: SsoString,
+    /// Current active buffer
+    buffer: Arc<wgpu::Buffer>,
+    /// Number of elements reserved in the buffer for data, not including the header.
+    capacity_elements: u64,
+    /// Size of output partition
+    output_partition_elements: u64,
+    /// Size of input partition
+    input_partition_elements: u64,
+    /// When false, the output partition comes first.
+    ///
+    /// When true, the input partition comes first.
+    flipped: bool,
+    /// Clear on swap
+    ///
+    /// When true, the data in both partitions will be cleared when the buffer
+    /// is swapped.
+    clear_on_swap: bool,
+    /// The size of each element in the buffer. This allows the user to provide sizes in element counts only.
+    ///
+    /// Must be a multiple of `element_alignment`.
+    element_size: u64,
+    /// Size of the header, including padding.
+ padded_header_size: u64, +} + +impl Deref for InputOutputBuffer { + type Target = Arc; + + fn deref(&self) -> &Self::Target { + &self.buffer + } +} + +impl InputOutputBuffer { + const USAGES: wgpu::BufferUsages = wgpu::BufferUsages::STORAGE + .union(wgpu::BufferUsages::COPY_DST) + .union(wgpu::BufferUsages::COPY_SRC) + .union(wgpu::BufferUsages::INDEX) + .union(wgpu::BufferUsages::INDIRECT); + + // The size of the header, including padding + fn padded_header_size(element_alignment: u64) -> u64 { + const HEADER_SIZE: u64 = 8; + HEADER_SIZE.round_up(element_alignment) + } + + fn capacity_elements(input_partition_elements: u64, output_partition_elements: u64) -> u64 { + let max = input_partition_elements.max(output_partition_elements); + max.next_power_of_two() * 2 + } + + fn buffer_size(padded_header_size: u64, capacity_elements: u64, element_size: u64) -> u64 { + capacity_elements * element_size + padded_header_size + } + + pub fn new( + device: &wgpu::Device, + queue: &wgpu::Queue, + partition_elements: u64, + label: &str, + element_size: u64, + element_alignment: u64, + clear_on_swap: bool, + ) -> Self { + let element_size = element_size.round_up(element_alignment); + let capacity_elements = Self::capacity_elements(partition_elements, partition_elements); + let padded_header_size = Self::padded_header_size(element_alignment); + let buffer_length = Self::buffer_size(padded_header_size, capacity_elements, element_size); + + let buffer = Arc::new(device.create_buffer(&wgpu::BufferDescriptor { + label: Some(label), + size: buffer_length, + usage: Self::USAGES, + mapped_at_creation: false, + })); + + let this = Self { + label: SsoString::from(label), + buffer, + capacity_elements, + output_partition_elements: partition_elements, + input_partition_elements: partition_elements, + flipped: false, + clear_on_swap, + element_size, + padded_header_size, + }; + + this.write_headers(queue); + + this + } + + /// Returns the offset in bytes for a given element in the given partition + pub fn element_offset(&self, partition: InputOutputPartition, element: u64) -> u64 { + let partition_offset = match partition { + InputOutputPartition::Input => self.input_partition_offset(), + InputOutputPartition::Output => self.output_partition_offset(), + }; + self.padded_header_size + partition_offset + element * self.element_size + } + + pub fn partition_slice(&self, partition: InputOutputPartition) -> Range { + let partition_offset = match partition { + InputOutputPartition::Input => self.input_partition_offset(), + InputOutputPartition::Output => self.output_partition_offset(), + }; + let partition_elements = match partition { + InputOutputPartition::Input => self.input_partition_elements, + InputOutputPartition::Output => self.output_partition_elements, + }; + let partition_size = partition_elements * self.element_size; + let slice_start = self.padded_header_size + partition_offset; + let slice_end: u64 = slice_start + partition_size; + slice_start..slice_end + } + + pub fn write_to_output(&self, queue: &wgpu::Queue, data: &T) { + assert_eq!(data.size().get(), self.output_partition_elements * self.element_size); + let mut mapping = queue + .write_buffer_with( + &self.buffer, + self.element_offset(InputOutputPartition::Output, 0), + data.size(), + ) + .unwrap(); + StorageBuffer::new(&mut *mapping).write(data).unwrap(); + drop(mapping); + } + + /// Returns the offset in bytes to get to the start of the output partition, not including the header. 
+    fn output_partition_offset(&self) -> u64 {
+        if self.flipped {
+            (self.capacity_elements * self.element_size) / 2
+        } else {
+            0
+        }
+    }
+
+    /// Returns the offset in bytes to get to the start of the input partition, not including the header.
+    fn input_partition_offset(&self) -> u64 {
+        if self.flipped {
+            0
+        } else {
+            (self.capacity_elements * self.element_size) / 2
+        }
+    }
+
+    pub fn swap(
+        &mut self,
+        queue: &wgpu::Queue,
+        device: &wgpu::Device,
+        encoder: &mut CommandEncoder,
+        new_partition_elements: u64,
+    ) {
+        // Offset of the output partition in the old buffer.
+        let old_output_partition_offset = self.output_partition_offset();
+
+        // The output of last frame is now the input of this frame.
+        self.input_partition_elements = self.output_partition_elements;
+        // The new output is of the given size.
+        self.output_partition_elements = new_partition_elements;
+        // We're now flipped.
+        self.flipped = !self.flipped;
+
+        // Compute the new data capacity
+        let new_capacity_elements =
+            Self::capacity_elements(self.input_partition_elements, self.output_partition_elements);
+
+        if new_capacity_elements != self.capacity_elements {
+            // Set the capacity reserved
+            self.capacity_elements = new_capacity_elements;
+            let new_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+                label: Some(&self.label),
+                size: Self::buffer_size(self.padded_header_size, new_capacity_elements, self.element_size),
+                usage: Self::USAGES,
+                mapped_at_creation: false,
+            });
+            if !self.clear_on_swap {
+                // We copy the old output partition to the input partition of the new buffer.
+                //
+                // Note that we call output_partition_offset before we change any internal parameters,
+                // as we need the old buffer offsets.
+                encoder.copy_buffer_to_buffer(
+                    &self.buffer,
+                    old_output_partition_offset + self.padded_header_size,
+                    &new_buffer,
+                    self.input_partition_offset() + self.padded_header_size,
+                    self.input_partition_elements * self.element_size,
+                );
+            }
+            // We now set the new buffer.
+            self.buffer = Arc::new(new_buffer);
+        } else if self.clear_on_swap {
+            encoder.clear_buffer(&self.buffer, self.padded_header_size, None);
+        }
+
+        self.write_headers(queue)
+    }
+
+    fn write_headers(&self, queue: &wgpu::Queue) {
+        let offsets = [
+            (self.output_partition_offset() / self.element_size) as u32,
+            (self.input_partition_offset() / self.element_size) as u32,
+        ];
+        queue.write_buffer(&self.buffer, 0, bytemuck::cast_slice(&offsets));
+    }
+}
diff --git a/rend3-routine/src/forward.rs b/rend3-routine/src/forward.rs
index 92711ac9..c33fa69c 100644
--- a/rend3-routine/src/forward.rs
+++ b/rend3-routine/src/forward.rs
@@ -2,7 +2,7 @@
 //!
 //! Will default to the PBR shader code if custom code is not specified.
-use std::marker::PhantomData; +use std::{marker::PhantomData, sync::Arc}; use arrayvec::ArrayVec; use encase::ShaderSize; @@ -11,21 +11,21 @@ use rend3::{ DataHandle, NodeResourceUsage, RenderGraph, RenderPassDepthTarget, RenderPassTarget, RenderPassTargets, RenderTargetHandle, }, - types::{Handedness, Material, SampleCount}, - util::bind_merge::BindGroupBuilder, + types::{GraphDataHandle, Material, SampleCount}, + util::{bind_merge::BindGroupBuilder, typedefs::FastHashMap}, ProfileData, Renderer, RendererDataCore, RendererProfile, ShaderPreProcessor, }; use serde::Serialize; use wgpu::{ BindGroup, BindGroupLayout, Color, ColorTargetState, ColorWrites, CompareFunction, DepthBiasState, - DepthStencilState, Face, FragmentState, FrontFace, IndexFormat, MultisampleState, PipelineLayoutDescriptor, - PolygonMode, PrimitiveState, PrimitiveTopology, RenderPipeline, RenderPipelineDescriptor, ShaderModule, - StencilState, TextureFormat, VertexState, + DepthStencilState, FragmentState, IndexFormat, MultisampleState, PipelineLayoutDescriptor, PolygonMode, + PrimitiveState, PrimitiveTopology, RenderPipeline, RenderPipelineDescriptor, ShaderModule, StencilState, + TextureFormat, VertexState, }; use crate::{ - common::{PerMaterialArchetypeInterface, WholeFrameInterfaces}, - culling::{self, DrawCall}, + common::{CameraIndex, PerMaterialArchetypeInterface, WholeFrameInterfaces}, + culling::{self, CullingBufferMap, DrawCall, DrawCallSet, InputOutputPartition}, }; #[derive(Serialize)] @@ -50,7 +50,7 @@ pub struct ShaderModulePair<'a> { pub struct RoutineArgs<'a, M> { pub name: &'a str, - pub renderer: &'a Renderer, + pub renderer: &'a Arc, pub data_core: &'a mut RendererDataCore, pub spp: &'a ShaderPreProcessor, @@ -61,6 +61,8 @@ pub struct RoutineArgs<'a, M> { pub routine_type: RoutineType, pub shaders: ShaderModulePair<'a>, + pub culling_buffer_map_handle: GraphDataHandle, + pub extra_bgls: &'a [&'a BindGroupLayout], #[allow(clippy::type_complexity)] pub descriptor_callback: Option<&'a dyn Fn(&mut RenderPipelineDescriptor<'_>, &mut [Option])>, @@ -69,7 +71,10 @@ pub struct RoutineArgs<'a, M> { pub struct RoutineAddToGraphArgs<'a, 'node, M> { pub graph: &'a mut RenderGraph<'node>, pub whole_frame_uniform_bg: DataHandle, - pub culled: DataHandle, + // If this is None, we are rendering the first pass with the predicted triangles from last frame. + // + // If this is Some, we are rendering the second pass with the residual triangles from this frame. + pub culling_output_handle: Option>>, pub per_material: &'node PerMaterialArchetypeInterface, pub extra_bgs: Option<&'node [BindGroup]>, pub label: &'a str, @@ -77,8 +82,7 @@ pub struct RoutineAddToGraphArgs<'a, 'node, M> { pub color: Option, pub resolve: Option, pub depth: RenderTargetHandle, - /// Passed to the shader through the instance index. - pub data: u32, + pub camera: CameraIndex, } /// A set of pipelines for rendering a specific combination of a material. 
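For orientation: `culling_output_handle` doubles as the pass selector. `None` makes the routine replay the draw call set it cached for that camera on the previous frame (the predicted pass), while `Some(..)` consumes the culling output produced this frame (the residual pass). Below is a minimal sketch of scheduling the predicted pass for the viewport camera, mirroring the residual helper shown earlier in base.rs; the helper name, the `state` field access, and the import paths are illustrative assumptions, not part of this patch.

use rend3::{graph::RenderGraph, types::SampleCount};
use rend3_routine::{
    base::BaseRenderGraphIntermediateState, common::CameraIndex, forward::RoutineAddToGraphArgs, pbr::PbrRoutine,
};

// Hypothetical helper: schedule the predicted opaque pass. With no culling output
// handle, the routine falls back to the draw call set cached for this camera last frame.
fn add_predicted_opaque_pass<'node>(
    state: &BaseRenderGraphIntermediateState,
    graph: &mut RenderGraph<'node>,
    pbr: &'node PbrRoutine,
    samples: SampleCount,
) {
    pbr.opaque_routine.add_forward_to_graph(RoutineAddToGraphArgs {
        graph,
        whole_frame_uniform_bg: state.forward_uniform_bg,
        culling_output_handle: None, // predicted pass: no same-frame culling results yet
        per_material: &pbr.per_material,
        extra_bgs: None,
        label: "PBR Forward Pass 1",
        samples,
        camera: CameraIndex::Viewport,
        color: Some(state.color),
        resolve: state.resolve,
        depth: state.depth.rendering_target(),
    });
}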
@@ -86,6 +90,8 @@ pub struct ForwardRoutine<M: Material> {
     pub pipeline_s1: RenderPipeline,
     pub pipeline_s4: RenderPipeline,
     pub material_key: u64,
+    pub culling_buffer_map_handle: GraphDataHandle<CullingBufferMap>,
+    pub draw_call_set_cache_handle: GraphDataHandle<FastHashMap<CameraIndex, Arc<DrawCallSet>>>,
     pub _phantom: PhantomData<M>,
 }
 impl<M: Material> ForwardRoutine<M> {
@@ -130,12 +136,13 @@ impl<M: Material> ForwardRoutine<M> {
             pipeline_s1: build_forward_pipeline_inner(&pll, &args, SampleCount::One),
             pipeline_s4: build_forward_pipeline_inner(&pll, &args, SampleCount::Four),
             material_key: args.material_key,
+            draw_call_set_cache_handle: args.renderer.add_graph_data(FastHashMap::default()),
+            culling_buffer_map_handle: args.culling_buffer_map_handle,
             _phantom: PhantomData,
         }
     }
     /// Add the given routine to the graph with the given settings.
-    #[allow(clippy::too_many_arguments)]
     pub fn add_forward_to_graph<'node>(&'node self, args: RoutineAddToGraphArgs<'_, 'node, M>) {
         let mut builder = args.graph.add_node(args.label);
@@ -162,24 +169,73 @@ impl<M: Material> ForwardRoutine<M> {
         });
         let whole_frame_uniform_handle = builder.add_data(args.whole_frame_uniform_bg, NodeResourceUsage::Input);
-        let cull_handle = builder.add_data(args.culled, NodeResourceUsage::Input);
+        let culling_output_handle = builder.add_optional_data(args.culling_output_handle, NodeResourceUsage::Input);
         builder.build(move |mut ctx| {
             let rpass = ctx.encoder_or_pass.take_rpass(rpass_handle);
             let whole_frame_uniform_bg = ctx.graph_data.get_data(ctx.temps, whole_frame_uniform_handle).unwrap();
-            let culled = match ctx.graph_data.get_data(ctx.temps, cull_handle) {
-                Some(c) => c,
-                None => return,
+
+            // We need to store the draw call set in a cache so that next frame's predicted pass can use it.
+            let mut draw_call_set_cache = ctx.data_core.graph_storage.get_mut(&self.draw_call_set_cache_handle);
+
+            let draw_call_set = match culling_output_handle {
+                // If we are provided a culling output handle, we are rendering the second pass
+                // with the residual triangles from this frame.
+                Some(handle) => {
+                    // If the culling node didn't produce a draw call set, there isn't actually anything to render.
+                    let Some(draw_call_set) = ctx.graph_data.get_data(ctx.temps, handle) else {
+                        return;
+                    };
+
+                    // As we're in the residual pass, we need to store the draw call set for the next frame.
+                    draw_call_set_cache.insert(args.camera, Arc::clone(draw_call_set));
+
+                    draw_call_set
+                }
+                // If we are not provided a culling output handle, this means we are rendering the first pass
+                // with the predicted triangles from last frame.
+                None => {
+                    // If there is no draw call set for this camera in the cache, that means we have yet to actually render anything,
+                    // so either no objects yet exist, or we are in the first frame.
+                    let Some(draw_call_set) = draw_call_set_cache.get(&args.camera) else {
+                        return;
+                    };
+
+                    draw_call_set
+                }
+            };
+            let residual = culling_output_handle.is_some() && args.camera.is_viewport();
+
+            let culling_buffer_storage = ctx.data_core.graph_storage.get(&self.culling_buffer_map_handle);
+
+            // If there are no culling buffers in storage yet, we are in the first frame. We depend on culling
+            // to render anything, so just bail at this point.
+            let Some(culling_buffers) = culling_buffer_storage.get_buffers(args.camera) else {
+                return;
+            };
+
+            // We need to actually clone ownership of the underlying buffers and add them to renderpass temps,
+            // so we can use them in the renderpass.
+ let index_buffer = ctx.temps.add(Arc::clone(&culling_buffers.index_buffer)); + let draw_call_buffer = ctx.temps.add(Arc::clone(&culling_buffers.draw_call_buffer)); + + // When we're rendering the residual data, we are post buffer flip. We want to be rendering using the + // "input" partition, as this is the partition that all same-frame data is in. + let partition = if residual { + InputOutputPartition::Input + } else { + InputOutputPartition::Output }; let per_material_bg = ctx.temps.add( BindGroupBuilder::new() .append_buffer(ctx.data_core.object_manager.buffer::().unwrap()) .append_buffer_with_size( - &culled.buffers.object_reference, + &draw_call_set.culling_data_buffer, culling::ShaderBatchData::SHADER_SIZE.get(), ) .append_buffer(&ctx.eval_output.mesh_buffer) + .append_buffer(&draw_call_set.per_camera_uniform) .append_buffer(ctx.data_core.material_manager.archetype_view::().buffer()) .build(&ctx.renderer.device, Some("Per-Material BG"), &args.per_material.bgl), ); @@ -188,8 +244,10 @@ impl ForwardRoutine { SampleCount::One => &self.pipeline_s1, SampleCount::Four => &self.pipeline_s4, }; - - rpass.set_index_buffer(culled.buffers.index.slice(..), IndexFormat::Uint32); + rpass.set_index_buffer( + index_buffer.slice(culling_buffers.index_buffer.partition_slice(partition)), + IndexFormat::Uint32, + ); rpass.set_pipeline(pipeline); rpass.set_bind_group(0, whole_frame_uniform_bg, &[]); if let Some(v) = args.extra_bgs { @@ -201,12 +259,18 @@ impl ForwardRoutine { rpass.set_bind_group(2, bg, &[]); } - let Some(range) = culled.material_key_ranges.get(&self.material_key) else { + // If there are no draw calls for this material, just bail. + let Some(range) = draw_call_set.material_key_ranges.get(&self.material_key) else { return; }; - for call in &culled.draw_calls[range.clone()] { + + for (range_relative_idx, call) in draw_call_set.draw_calls[range.clone()].iter().enumerate() { + // Help RA out let call: &DrawCall = call; + // Add the base of the range to the index to get the actual index + let idx = range_relative_idx + range.start; + // If we're in cpu driven mode, we need to update the texture bind group. 
if ctx.renderer.profile.is_cpu_driven() { rpass.set_bind_group( 2, @@ -219,7 +283,10 @@ impl ForwardRoutine { per_material_bg, &[call.batch_index * culling::ShaderBatchData::SHADER_SIZE.get() as u32], ); - rpass.draw_indexed(call.index_range.clone(), 0, args.data..args.data + 1); + rpass.draw_indexed_indirect( + draw_call_buffer, + culling_buffers.draw_call_buffer.element_offset(partition, idx as u64), + ); } }); } @@ -249,11 +316,11 @@ fn build_forward_pipeline_inner( primitive: PrimitiveState { topology: PrimitiveTopology::TriangleList, strip_index_format: None, - front_face: match args.renderer.handedness { - Handedness::Left => FrontFace::Cw, - Handedness::Right => FrontFace::Ccw, - }, - cull_mode: Some(Face::Back), + front_face: args.renderer.handedness.into(), + cull_mode: Some(match args.routine_type { + RoutineType::Depth => wgpu::Face::Front, + RoutineType::Forward => wgpu::Face::Back, + }), unclipped_depth: false, polygon_mode: PolygonMode::Fill, conservative: false, @@ -264,9 +331,10 @@ fn build_forward_pipeline_inner( depth_compare: CompareFunction::GreaterEqual, stencil: StencilState::default(), bias: match args.routine_type { + // TODO: figure out what to put here RoutineType::Depth => DepthBiasState { - constant: -2, - slope_scale: -2.0, + constant: 0, + slope_scale: 0.0, clamp: 0.0, }, RoutineType::Forward => DepthBiasState::default(), diff --git a/rend3-routine/src/hi_z.rs b/rend3-routine/src/hi_z.rs new file mode 100644 index 00000000..39f6d0b2 --- /dev/null +++ b/rend3-routine/src/hi_z.rs @@ -0,0 +1,276 @@ +use std::borrow::Cow; + +use glam::UVec2; +use rend3::{ + graph::{ + DeclaredDependency, NodeExecutionContext, NodeResourceUsage, RenderGraph, RenderPassDepthTarget, + RenderPassHandle, RenderPassTargets, RenderTargetHandle, ViewportRect, + }, + Renderer, ShaderPreProcessor, +}; +use wgpu::{ + BindGroupDescriptor, BindGroupEntry, BindGroupLayout, BindGroupLayoutDescriptor, BindGroupLayoutEntry, + BindingResource, BindingType, CompareFunction, DepthBiasState, DepthStencilState, Extent3d, FragmentState, + MultisampleState, PipelineLayoutDescriptor, PrimitiveState, RenderPipeline, RenderPipelineDescriptor, + ShaderModuleDescriptor, ShaderStages, StencilState, TextureDimension, TextureFormat, TextureSampleType, + TextureViewDimension, VertexState, +}; + +use crate::base::DepthTargets; + +pub struct HiZRoutine { + multisampled_bgl: BindGroupLayout, + single_sampled_bgl: BindGroupLayout, + downscale_pipeline: RenderPipeline, + resolve_pipeline: RenderPipeline, +} + +impl HiZRoutine { + pub fn new(renderer: &Renderer, spp: &ShaderPreProcessor) -> Self { + let resolve_source = spp + .render_shader( + "rend3-routine/resolve_depth_min.wgsl", + &serde_json::json!({"SAMPLES": 4}), + None, + ) + .unwrap(); + let downscale_source = spp.render_shader("rend3-routine/hi_z.wgsl", &(), None).unwrap(); + + let resolve_sm = renderer.device.create_shader_module(ShaderModuleDescriptor { + label: Some("HiZ Resolver"), + source: wgpu::ShaderSource::Wgsl(Cow::Owned(resolve_source)), + }); + let downscale_sm = renderer.device.create_shader_module(ShaderModuleDescriptor { + label: Some("HiZ Downscaler"), + source: wgpu::ShaderSource::Wgsl(Cow::Owned(downscale_source)), + }); + + let multisampled_bgl = renderer.device.create_bind_group_layout(&BindGroupLayoutDescriptor { + label: Some("Multi Sample HiZ Texture BGL"), + entries: &[BindGroupLayoutEntry { + binding: 0, + visibility: ShaderStages::VERTEX_FRAGMENT, + ty: BindingType::Texture { + sample_type: TextureSampleType::Depth, + view_dimension: 
TextureViewDimension::D2, + multisampled: true, + }, + count: None, + }], + }); + + let single_sampled_bgl = renderer.device.create_bind_group_layout(&BindGroupLayoutDescriptor { + label: Some("Single Sample HiZ Texture BGL"), + entries: &[BindGroupLayoutEntry { + binding: 0, + visibility: ShaderStages::VERTEX_FRAGMENT, + ty: BindingType::Texture { + sample_type: TextureSampleType::Depth, + view_dimension: TextureViewDimension::D2, + multisampled: false, + }, + count: None, + }], + }); + + let resolve_pipline_layout = renderer.device.create_pipeline_layout(&PipelineLayoutDescriptor { + label: Some("HiZ Resolve PLL"), + bind_group_layouts: &[&multisampled_bgl], + push_constant_ranges: &[], + }); + + let downscale_pipline_layout = renderer.device.create_pipeline_layout(&PipelineLayoutDescriptor { + label: Some("HiZ Downscale PLL"), + bind_group_layouts: &[&single_sampled_bgl], + push_constant_ranges: &[], + }); + + let resolve_pipeline = renderer.device.create_render_pipeline(&RenderPipelineDescriptor { + label: Some("HiZ Resolve Pipeline"), + layout: Some(&resolve_pipline_layout), + vertex: VertexState { + module: &resolve_sm, + entry_point: "vs_main", + buffers: &[], + }, + primitive: PrimitiveState::default(), + depth_stencil: Some(DepthStencilState { + format: TextureFormat::Depth32Float, + depth_write_enabled: true, + depth_compare: CompareFunction::Always, + stencil: StencilState::default(), + bias: DepthBiasState::default(), + }), + multisample: MultisampleState::default(), + fragment: Some(FragmentState { + module: &resolve_sm, + entry_point: "fs_main", + targets: &[], + }), + multiview: None, + }); + + let downscale_pipeline = renderer.device.create_render_pipeline(&RenderPipelineDescriptor { + label: Some("HiZ Downscale Pipeline"), + layout: Some(&downscale_pipline_layout), + vertex: VertexState { + module: &downscale_sm, + entry_point: "vs_main", + buffers: &[], + }, + primitive: PrimitiveState::default(), + depth_stencil: Some(DepthStencilState { + format: TextureFormat::Depth32Float, + depth_write_enabled: true, + depth_compare: CompareFunction::Always, + stencil: StencilState::default(), + bias: DepthBiasState::default(), + }), + multisample: MultisampleState::default(), + fragment: Some(FragmentState { + module: &downscale_sm, + entry_point: "fs_main", + targets: &[], + }), + multiview: None, + }); + + Self { + single_sampled_bgl, + downscale_pipeline, + multisampled_bgl, + resolve_pipeline, + } + } + + pub fn resolve<'pass>( + &'pass self, + mut ctx: NodeExecutionContext<'_, 'pass, '_>, + renderpass_handle: DeclaredDependency, + source_handle: DeclaredDependency, + ) { + let rpass = ctx.encoder_or_pass.take_rpass(renderpass_handle); + let source = ctx.graph_data.get_render_target(source_handle); + + let bind_group = ctx + .temps + .add(ctx.renderer.device.create_bind_group(&BindGroupDescriptor { + label: Some("HiZ Resolve BG"), + layout: &self.multisampled_bgl, + entries: &[BindGroupEntry { + binding: 0, + resource: BindingResource::TextureView(source), + }], + })); + + rpass.set_pipeline(&self.resolve_pipeline); + rpass.set_bind_group(0, bind_group, &[]); + rpass.draw(0..3, 0..1); + } + + pub fn downscale<'pass>( + &'pass self, + mut ctx: NodeExecutionContext<'_, 'pass, '_>, + renderpass_handle: DeclaredDependency, + source_handle: DeclaredDependency, + ) { + let rpass = ctx.encoder_or_pass.take_rpass(renderpass_handle); + let source = ctx.graph_data.get_render_target(source_handle); + + let bind_group = ctx + .temps + 
.add(ctx.renderer.device.create_bind_group(&BindGroupDescriptor { + label: Some("HiZ Bind Group Layout"), + layout: &self.single_sampled_bgl, + entries: &[BindGroupEntry { + binding: 0, + resource: BindingResource::TextureView(source), + }], + })); + + rpass.set_pipeline(&self.downscale_pipeline); + rpass.set_bind_group(0, bind_group, &[]); + rpass.draw(0..3, 0..1); + } + + pub fn add_hi_z_to_graph<'node>( + &'node self, + graph: &mut RenderGraph<'node>, + depth_targets: DepthTargets, + resolution: UVec2, + ) { + let extent = Extent3d { + width: resolution.x, + height: resolution.y, + depth_or_array_layers: 1, + }; + let mips = extent.max_mips(TextureDimension::D2) as u8; + + // First we need to downscale the depth buffer to a single sample texture + // if we are doing multisampling. + if let Some(multi_sample) = depth_targets.multi_sample { + let mut node = graph.add_node("HiZ Resolve"); + + let target = node.add_render_target( + depth_targets.single_sample_mipped.set_mips(0..1), + NodeResourceUsage::Output, + ); + + let source = node.add_render_target(multi_sample, NodeResourceUsage::Output); + + let rpass_handle = node.add_renderpass(RenderPassTargets { + targets: vec![], + depth_stencil: Some(RenderPassDepthTarget { + target, + depth_clear: Some(0.0), + stencil_clear: None, + }), + }); + + node.add_side_effect(); + + node.build(move |ctx| { + self.resolve(ctx, rpass_handle, source); + }); + } + + for dst_mip in 1..mips { + let src_mip = dst_mip - 1; + + let mut node = graph.add_node(&format!("HiZ Mip {src_mip} -> {dst_mip}")); + + let dst_extent = extent.mip_level_size(dst_mip as u32, TextureDimension::D2); + let src_extent = extent.mip_level_size(src_mip as u32, TextureDimension::D2); + + let dst_target = node.add_render_target( + depth_targets + .single_sample_mipped + .set_mips(dst_mip..dst_mip + 1) + .set_viewport(ViewportRect::from_size(UVec2::new(dst_extent.width, dst_extent.height))), + NodeResourceUsage::Output, + ); + let src_target = node.add_render_target( + depth_targets + .single_sample_mipped + .set_mips(src_mip..src_mip + 1) + .set_viewport(ViewportRect::from_size(UVec2::new(src_extent.width, src_extent.height))), + NodeResourceUsage::Input, + ); + + let rpass_handle = node.add_renderpass(RenderPassTargets { + targets: vec![], + depth_stencil: Some(RenderPassDepthTarget { + target: dst_target, + depth_clear: Some(0.0), + stencil_clear: None, + }), + }); + + node.add_side_effect(); + + node.build(move |ctx| { + self.downscale(ctx, rpass_handle, src_target); + }); + } + } +} diff --git a/rend3-routine/src/lib.rs b/rend3-routine/src/lib.rs index b09cbc4d..9f13361b 100644 --- a/rend3-routine/src/lib.rs +++ b/rend3-routine/src/lib.rs @@ -23,6 +23,7 @@ pub mod clear; pub mod common; pub mod culling; pub mod forward; +pub mod hi_z; pub mod pbr; mod shaders; pub mod skinning; diff --git a/rend3-routine/src/pbr/routine.rs b/rend3-routine/src/pbr/routine.rs index 36878e37..f54e7914 100644 --- a/rend3-routine/src/pbr/routine.rs +++ b/rend3-routine/src/pbr/routine.rs @@ -1,12 +1,16 @@ -use std::borrow::Cow; +use std::{borrow::Cow, sync::Arc}; -use rend3::{Renderer, RendererDataCore, RendererProfile, ShaderPreProcessor, ShaderVertexBufferConfig}; +use rend3::{ + types::GraphDataHandle, Renderer, RendererDataCore, RendererProfile, ShaderPreProcessor, ShaderVertexBufferConfig, +}; use serde::Serialize; use wgpu::{BlendState, ShaderModuleDescriptor, ShaderSource}; use crate::{ common::{PerMaterialArchetypeInterface, WholeFrameInterfaces}, + culling::CullingBufferMap, 
forward::{ForwardRoutine, RoutineArgs, RoutineType, ShaderModulePair}, + hi_z::HiZRoutine, pbr::{PbrMaterial, TransparencyType}, }; @@ -23,15 +27,17 @@ pub struct PbrRoutine { pub opaque_routine: ForwardRoutine, pub cutout_routine: ForwardRoutine, pub blend_routine: ForwardRoutine, + pub hi_z: HiZRoutine, pub per_material: PerMaterialArchetypeInterface, } impl PbrRoutine { pub fn new( - renderer: &Renderer, + renderer: &Arc, data_core: &mut RendererDataCore, spp: &ShaderPreProcessor, interfaces: &WholeFrameInterfaces, + culling_buffer_map_handle: &GraphDataHandle, ) -> Self { profiling::scope!("PbrRenderRoutine::new"); @@ -125,6 +131,7 @@ impl PbrRoutine { targets[0].as_mut().unwrap().blend = Some(BlendState::ALPHA_BLENDING) } }), + culling_buffer_map_handle: culling_buffer_map_handle.clone(), }) }; @@ -134,6 +141,7 @@ impl PbrRoutine { opaque_routine: inner(RoutineType::Forward, &pbr_forward, TransparencyType::Opaque), cutout_routine: inner(RoutineType::Forward, &pbr_cutout, TransparencyType::Cutout), blend_routine: inner(RoutineType::Forward, &pbr_forward, TransparencyType::Blend), + hi_z: HiZRoutine::new(renderer, spp), per_material, } } diff --git a/rend3-routine/src/shaders.rs b/rend3-routine/src/shaders.rs index e59ac7db..89f1ac85 100644 --- a/rend3-routine/src/shaders.rs +++ b/rend3-routine/src/shaders.rs @@ -23,7 +23,8 @@ mod tests { }, }; use naga::WithSpan; - use rend3::{RendererProfile, ShaderConfig, ShaderPreProcessor, ShaderVertexBufferConfig}; + use rend3::{RendererProfile, ShaderPreProcessor, ShaderVertexBufferConfig}; + use serde_json::json; use crate::{pbr::PbrMaterial, shaders::Rend3RoutineShaderSources}; @@ -70,16 +71,24 @@ mod tests { let source = pp.get(shader).unwrap(); let configs = if source.contains("#if") { - &[ - ShaderConfig { - profile: Some(RendererProfile::CpuDriven), - }, - ShaderConfig { - profile: Some(RendererProfile::GpuDriven), - }, - ][..] + vec![ + json!({ + "profile": Some(RendererProfile::GpuDriven), + "position_attribute_offset": 0, + "SAMPLES": 1, + }), + json!({ + "profile": Some(RendererProfile::CpuDriven), + "position_attribute_offset": 0, + "SAMPLES": 1, + }), + ] } else { - &[ShaderConfig { profile: None }][..] 
+ vec![json!({ + "profile": Some(RendererProfile::CpuDriven), + "position_attribute_offset": 0, + "SAMPLES": 1, + })] }; if source.contains("DO NOT VALIDATE") { @@ -87,12 +96,11 @@ mod tests { } for config in configs { - let serialized_config = serde_json::to_value(config).unwrap(); - println!("Testing shader {shader} with config {serialized_config:?}"); + println!("Testing shader {shader} with config {config:?}"); let output = pp.render_shader( shader, - config, + &config, Some(&ShaderVertexBufferConfig::from_material::()), ); diff --git a/rend3-routine/src/skinning.rs b/rend3-routine/src/skinning.rs index 3990338e..e15d2513 100644 --- a/rend3-routine/src/skinning.rs +++ b/rend3-routine/src/skinning.rs @@ -10,7 +10,7 @@ use rend3::{ }, util::{ bind_merge::{BindGroupBuilder, BindGroupLayoutBuilder}, - math::round_up_div, + math::div_round_up, }, ShaderPreProcessor, }; @@ -199,7 +199,7 @@ impl GpuSkinner { let offset = (i as u64 * GpuSkinningInput::SHADER_SIZE.get()) as u32; cpass.set_bind_group(0, &bg, &[offset]); - let num_workgroups = round_up_div(skel.vertex_count, Self::WORKGROUP_SIZE); + let num_workgroups = div_round_up(skel.vertex_count, Self::WORKGROUP_SIZE); cpass.dispatch_workgroups(num_workgroups, 1, 1); } } diff --git a/rend3-test/Cargo.toml b/rend3-test/Cargo.toml index 6dddcd74..8ce5668b 100644 --- a/rend3-test/Cargo.toml +++ b/rend3-test/Cargo.toml @@ -15,6 +15,7 @@ name = "rend3-tests" [dependencies] anyhow = "1" +env_logger = "0.10" flume = { version = "0.11", features = ["spin"] } glam = "0.24" image = { version = "0.24", default-features = false, features = ["png"] } diff --git a/rend3-test/src/helpers.rs b/rend3-test/src/helpers.rs index c56a6627..59dad809 100644 --- a/rend3-test/src/helpers.rs +++ b/rend3-test/src/helpers.rs @@ -10,6 +10,12 @@ use crate::TestRunner; pub struct CaptureDropGuard { device: Arc, } +impl CaptureDropGuard { + pub fn start_capture(device: Arc) -> Self { + device.start_capture(); + Self { device } + } +} impl Drop for CaptureDropGuard { fn drop(&mut self) { self.device.stop_capture(); @@ -19,13 +25,6 @@ impl Drop for CaptureDropGuard { } impl TestRunner { - pub fn start_capture(&self) -> CaptureDropGuard { - self.device.start_capture(); - CaptureDropGuard { - device: self.device.clone(), - } - } - pub fn add_directional_light(&self, direction: Vec3) -> DirectionalLightHandle { self.renderer.add_directional_light(rend3::types::DirectionalLight { color: glam::Vec3::ONE, diff --git a/rend3-test/src/lib.rs b/rend3-test/src/lib.rs index 3c286d78..1438b668 100644 --- a/rend3-test/src/lib.rs +++ b/rend3-test/src/lib.rs @@ -1,7 +1,9 @@ mod helpers; mod runner; +mod threshold; pub use runner::{FrameRenderSettings, TestRunner}; +pub use threshold::{Threshold, ThresholdSet}; #[macro_export] macro_rules! 
no_gpu_return { diff --git a/rend3-test/src/runner.rs b/rend3-test/src/runner.rs index e5f21856..b5ddd5f3 100644 --- a/rend3-test/src/runner.rs +++ b/rend3-test/src/runner.rs @@ -14,6 +14,8 @@ use wgpu::{ Extent3d, ImageCopyBuffer, ImageDataLayout, TextureDescriptor, TextureDimension, TextureFormat, TextureUsages, }; +use crate::{helpers::CaptureDropGuard, ThresholdSet}; + #[derive(Clone)] pub struct FrameRenderSettings { size: u32, @@ -68,6 +70,8 @@ impl TestRunnerBuilder { } pub async fn build(self) -> Result { + let _ = env_logger::try_init(); + let iad = match self.iad { Some(iad) => iad, None => rend3::create_iad(None, None, None, None) @@ -75,6 +79,8 @@ impl TestRunnerBuilder { .context("InstanceAdapterDevice creation failed")?, }; + let capture_guard = CaptureDropGuard::start_capture(Arc::clone(&iad.device)); + let renderer = rend3::Renderer::new(iad, self.handness.unwrap_or(Handedness::Left), None) .context("Renderer initialization failed")?; let mut spp = rend3::ShaderPreProcessor::new(); @@ -87,6 +93,7 @@ impl TestRunnerBuilder { &mut renderer.data_core.lock(), &spp, &base_rendergraph.interfaces, + &base_rendergraph.gpu_culler.culling_buffer_map_handle, ); let tonemapping = TonemappingRoutine::new( &renderer, @@ -100,6 +107,7 @@ impl TestRunnerBuilder { pbr, tonemapping, base_rendergraph, + capture_guard, }) } } @@ -109,6 +117,7 @@ pub struct TestRunner { pub pbr: PbrRoutine, pub tonemapping: TonemappingRoutine, pub base_rendergraph: BaseRenderGraph, + pub capture_guard: CaptureDropGuard, } impl Deref for TestRunner { @@ -148,6 +157,7 @@ impl TestRunner { let frame_handle = graph.add_imported_render_target( &texture, 0..1, + 0..1, rend3::graph::ViewportRect::from_size(UVec2::splat(settings.size)), ); @@ -223,7 +233,12 @@ impl TestRunner { .context("Failed to create image from mapping") } - pub fn compare_image_to_path(&self, test_rgba: &image::RgbaImage, path: &Path, threshold: f32) -> Result<()> { + pub fn compare_image_to_path( + &self, + test_rgba: &image::RgbaImage, + path: &Path, + threshold: impl Into, + ) -> Result<()> { #[cfg(not(target_arch = "wasm32"))] { let parent_path = path.parent().context("Path given had no parent")?; @@ -249,11 +264,11 @@ impl TestRunner { let mut pool = nv_flip::FlipPool::from_image(&result_float); - let mean: f32 = pool.mean(); - - let pass = mean <= threshold; + println!("Image Comparison Results"); + let threshold_set: ThresholdSet = threshold.into(); + let pass = threshold_set.check(&mut pool); - println!("Image Comparison Results: {}", if pass { "passed" } else { "failed" }); + println!(); println!(" Mean: {}", pool.mean()); println!(" Min: {}", pool.min_value()); println!(" 25%: {}", pool.get_percentile(0.25, true)); @@ -262,6 +277,8 @@ impl TestRunner { println!(" 95%: {}", pool.get_percentile(0.95, true)); println!(" 99%: {}", pool.get_percentile(0.99, true)); println!(" Max: {}", pool.max_value()); + println!("{}", if pass { "Passed!" } else { "Failed!" 
}); + println!(); let filename = path.file_stem().unwrap(); @@ -286,7 +303,7 @@ impl TestRunner { &self, settings: FrameRenderSettings, path: impl AsRef, - threshold: f32, + threshold: impl Into, ) -> Result<()> { let test_rgba = self.render_frame(settings).await?; diff --git a/rend3-test/src/threshold.rs b/rend3-test/src/threshold.rs new file mode 100644 index 00000000..e987faaf --- /dev/null +++ b/rend3-test/src/threshold.rs @@ -0,0 +1,73 @@ +pub struct ThresholdSet { + #[cfg_attr(target_arch = "wasm32", allow(unused))] + thresholds: Vec, +} + +impl ThresholdSet { + #[cfg(not(target_arch = "wasm32"))] + pub fn check(self, pool: &mut nv_flip::FlipPool) -> bool { + // If there are no checks, we want to fail the test. + let mut all_passed = !self.thresholds.is_empty(); + // We always iterate all of these, as the call to check prints + for check in self.thresholds { + all_passed &= check.check(pool); + } + all_passed + } +} + +#[derive(Debug, Copy, Clone, PartialEq)] +pub enum Threshold { + Mean(f32), + Percentile { percentile: f32, threshold: f32 }, +} + +impl Threshold { + #[cfg(not(target_arch = "wasm32"))] + fn check(&self, pool: &mut nv_flip::FlipPool) -> bool { + match *self { + Self::Mean(v) => { + let mean = pool.mean(); + let within = mean <= v; + println!( + " Expected Mean ({:.6}) to be under expected maximum ({}): {}", + mean, + v, + if within { "PASS" } else { "FAIL" } + ); + within + } + Self::Percentile { + percentile: p, + threshold: v, + } => { + let percentile = pool.get_percentile(p, true); + let within = percentile <= v; + println!( + " Expected {}% ({:.6}) to be under expected maximum ({}): {}", + p * 100.0, + percentile, + v, + if within { "PASS" } else { "FAIL" } + ); + within + } + } + } +} + +impl From for ThresholdSet { + fn from(threshold: Threshold) -> Self { + Self { + thresholds: vec![threshold], + } + } +} + +impl From<&[Threshold]> for ThresholdSet { + fn from(thresholds: &[Threshold]) -> Self { + Self { + thresholds: thresholds.into(), + } + } +} diff --git a/rend3-test/tests/msaa.rs b/rend3-test/tests/msaa.rs index 6bff054b..5c5da7ff 100644 --- a/rend3-test/tests/msaa.rs +++ b/rend3-test/tests/msaa.rs @@ -1,7 +1,7 @@ use anyhow::Context; use glam::{Mat4, Vec3, Vec4}; use rend3::types::{Camera, Handedness, MeshBuilder, Object, ObjectMeshKind, SampleCount}; -use rend3_test::{no_gpu_return, test_attr, FrameRenderSettings, TestRunner}; +use rend3_test::{no_gpu_return, test_attr, FrameRenderSettings, TestRunner, Threshold}; #[test_attr] pub async fn triangle() -> anyhow::Result<()> { @@ -47,7 +47,7 @@ pub async fn triangle() -> anyhow::Result<()> { .render_and_compare( FrameRenderSettings::new().samples(SampleCount::Four), "tests/results/msaa/four.png", - 0.0, + Threshold::Mean(0.0), ) .await?; @@ -96,7 +96,7 @@ pub async fn sample_coverage() -> anyhow::Result<()> { .render_and_compare( FrameRenderSettings::new().samples(samples), &format!("tests/results/msaa/sample-coverage-{}.png", samples as u8), - 0.0, + Threshold::Mean(0.0), ) .await?; } diff --git a/rend3-test/tests/object.rs b/rend3-test/tests/object.rs new file mode 100644 index 00000000..dceabd12 --- /dev/null +++ b/rend3-test/tests/object.rs @@ -0,0 +1,61 @@ +use anyhow::Context; +use glam::{Mat4, Vec3, Vec4}; +use rend3::{ + types::{Camera, Handedness}, + util::freelist::FreelistDerivedBuffer, +}; +use rend3_test::{no_gpu_return, test_attr, FrameRenderSettings, TestRunner, Threshold}; + +/// There was a bug in the culling implementation where the per-material buffer +/// was never resized to fit the 
number of objects in the scene once it was initially +/// created. This manifested as objects above the initial frame count would get all-zero +/// transforms and be completely hidden. We reproduce those conditions here, and ensure +/// that the bug is fixed. +#[test_attr] +pub async fn multi_frame_add() -> anyhow::Result<()> { + let iad = no_gpu_return!(rend3::create_iad(None, None, None, None).await) + .context("InstanceAdapterDevice creation failed")?; + + let Ok(runner) = TestRunner::builder() + .iad(iad.clone()) + .handedness(Handedness::Left) + .build() + .await + else { + return Ok(()); + }; + + let material = runner.add_unlit_material(Vec4::ONE); + + // Make a plane whose (0, 0) is at the top left, and is 1 unit large. + let base_matrix = Mat4::from_translation(Vec3::new(0.5, 0.5, 0.0)) * Mat4::from_scale(Vec3::new(0.5, 1.0, 1.0)); + + runner.set_camera_data(Camera { + projection: rend3::types::CameraProjection::Raw(Mat4::orthographic_lh(0.0, 2.0, 16.0, 0.0, 0.0, 1.0)), + view: Mat4::IDENTITY, + }); + + // We use the starting size amount of objects for each column, ensuring that the buffer + // will need to be resized on the second column. + let count = FreelistDerivedBuffer::STARTING_SIZE; + + // 2 side by side columns made up of `count` rows + let mut planes = Vec::with_capacity(2); + for x in 0..2 { + for y in 0..count { + planes.push(runner.plane( + material.clone(), + Mat4::from_translation(Vec3::new(x as f32, y as f32, 0.0)) * base_matrix, + )); + } + runner + .render_and_compare( + FrameRenderSettings::new(), + &format!("tests/results/object/multi-frame-add-{}.png", x), + Threshold::Mean(0.0), + ) + .await?; + } + + Ok(()) +} diff --git a/rend3-test/tests/results/object/multi-frame-add-0.png b/rend3-test/tests/results/object/multi-frame-add-0.png new file mode 100644 index 0000000000000000000000000000000000000000..9abacaa79d63abd2617654bd1ab62812a6598f09 GIT binary patch literal 439 zcmeAS@N?(olHy`uVBq!ia0vp^4j|0I1|(Ny7T#lEV65|WaSW+oeEWcLf{N!P6;Gk= zRWom;0XwS?QCs zHTR#ir_le}6@UJ_J^gR4^?!fh&;Pxr{^y7Mt6%)-zx34q_lK_3e~zc?Ss9mAF`ofO O2!p4qpUXO@geCy?n+mT0 literal 0 HcmV?d00001 diff --git a/rend3-test/tests/results/object/multi-frame-add-1.png b/rend3-test/tests/results/object/multi-frame-add-1.png new file mode 100644 index 0000000000000000000000000000000000000000..fe6c076e7aca24deafa2839e35f6da65263006e8 GIT binary patch literal 435 zcmeAS@N?(olHy`uVBq!ia0vp^4j|0I1|(Ny7T#lEV65_VaSW+oeEWcLf{N!P6;Gk= zRWom;0XwS?QCs zHTU2DMW6mlPyK&?#h?FfPyd^1{ofz>^MCKD|M?;R>K6}LsXxlkRvo39IUBX^stj0B9-FM$nuiv)SaE9$GpF+!j{(- zk{!bqr^ja;CGwWgydFg*iIPW3?0$YXXaDT;**?Gfx~}ir@47p8m26KpS%aIAm_9~B zl_|unEtH(Q{Fhf{68E#N3a^<78_TS+J#VY?+;oh=e-)q#>%d8e7;(z3;Qlu=4m;$?)hInWU8+ zn3TZ1-LK|^9msrfKzP8I=DaWG`W`4Omn6lCP6;UM@2_7N*vHAeDnn`5N!%Hz8cO$F zjUhfOgMLV5-A+Du8RbW{y@4IWHo;dWVb~xB?X6>_P-SP9=T#QTNzE>=e3jj0=Z6PPJ)$)d7qj<r|KaUVr5+b{$|a|)=2z%GDqGgiX?6C)QF`*-pg{Ah(4_|0(MHf8YMHG zONtlggO=ipJ`D>~glc($>e+Lzgb|dy|8&UXBiA%gWOeUhQ*OD^iyeqB;h446&r$L4 zG54PcymDm#p%wv@Pb2LVNCQj`%FU+2IAB^Q?QXT9pEnhi9{Y zb3d#zRlN4*!-ZM7&mohXgqKK72q|i;%DUD->F;;lr{aNA?!}ihQz_y_i1+fI9uun$ zx*6&}pc(2|GHQRl=(D`<*B9ML)s=axv?-|m&4Kt}lOa0(1=U>nAouQdB69et7nx6F zPPe!*r%56Yrs~U2e>uFi`7HOTh+pkE;Its20P1CyIgGG7fkgrpf7AF+y}XynE2Q9q zGc6^~SR;a_^yB1%!$ttefo94o6iElJHEgz%WVN5Z5`nS%HX53%pIN)Y%yE({-GZ?O z&#shYrT0^>YK!vc+H49y56>5IE;~wL*+iBaexFeAR+#|S;d|YXgnYfKXjThtPh_ZFyi#VhmIil zBXmnr!q|c474ADXEd2cO_5@UY^MDlwIAOIk4MRtxWR1-sLRY%8=St$@ds?{k_L6o6Xz=@qCcy5>Gb80TNYKkA1Ok3OtTv)5hox}xJ(bE8m z_ab_Rx?ouwpKO;-O$J4NA?JyDQ{Ob29(@@6E&u+IeX}yYnr<)wC$e^UFPD4Ny&okq 
zoiPlWV2Cd+`C`5PLf22BqN!kmU-QkFZ$2%J(Cw4q9fLLm-rzi#weWnO{1~50%;ly` z2lk3f1RVqN*i_aIX?|qvtMJNi`D0(THx%*{!-%s`W36wMBvJ?fI3ix$P^b9^bd*m|M6k1yuI`*y#mtM-G`dm+86`s22$f zq8$7kgW-WxSZ6VR*b2#;W~a8f(TxdT0kpt`+t|+5F>ML6{f(O zuet?ES`y=M@Um9X5vD>i{IDe=mIrSSqmJdSlY={7KOWYyGN;9XBtowd0s^KE4xORawm^5n1OKzE z70jbH!n&fP;ze&DCs^H66Jjt<1%9VWr^-N`qzRHVaE~A|RvE4=2A6)+oHpBp2O!gN zWdKJZ=6I5@GQ1_lLR$RAI&~Ki6Lb0!p!nCfyGdrLf0q{- z3ouI~00a`9F!VAYkItYE0n|ffhHXu%ZBWv}>_^Vh^>vYzm*ezHzR6GhY&kG&+qnvL z8VsJik{CnQ39+*k)=hhY-h?RVB0Zd z8S$P+gHrh&yb`lcpNZj({#H?R88bN2T}+ z*AuNNk^EG8<==Y+66lqIDCT}pWa_TJ+vKlbF4dPF`7o6MYET${^F>I3eJkmNanAZ3 zqXTI@ziz5<922$JAmdcI)6qx(BanL|>zk{QCt&sRp99?Gq+kna;Kc|Y*l_F}m)~-} z?8+1I4V8anH}9!_Z023x0_-67<#Ta&%r*(^dIy29*T;ZNLY(AZdyB2J!Ny?5YG~NZ zny0QLXIoMb(?LLWQxLRq^<)k77) zGfnkLKoD1(yJs#!xJKpUYP#?>h*i6Po~S`YRbAQ_p_~Q2kxHv>{Mae*4ME1Oz!;d&rXNuhS_sYvMnXeYr|%QFYiIRK>qSV zb&$Y}qA0_(v?-O{X4T!-Gr}oPkn37M`{r3n+HL4jh!wsf#jQLJqT%ck}JvO zN_QXuI1w#6%B-+YCo(JezIFMub_yj&qH&FecFrM#YZXN-WF@d-jp9k&Dn=S~g8I6V zw-gkXq2e^XZ^oQQy20A90f(Xg55*u5NYRHK1;Afhz-DmiN1`Y5O`Qc7(sJbw&JGi} p6K1ny7T5?LW}Q%(sxH&bP{B1ha$3Uce-WX;$JOO9T}(T{{Xcm!!zus( literal 5966 zcmeHLeK?fq{(mquO-+q4mUd`|p4F%}D*Hrf4Le>&5wn#}5lhD$8LhWUxwaXkjJIWo zm6B>F;xv(1%Q}ekBH5zy)^BTU*_5`{%0|rj-p^3&p6mR1uIqRGerK-B#XRQu-rxKC zdAsM4d3n0hY46i845JI&_(BZRfYfCI$3`*^7clb z35oN6Z~Tu8r~XRl_x&^S5f0A3#yX6(sT@A|KaI-$5+hlMZCiaaF?Tz=*FGccsH~%} zooH_(i1tRJYA>hg>N;_hL$CeHUf`jQ*LQ=7=hIe1ShR3;1YLR@cSojY&_XOHiMrfC z6i;XB$Av975Hq+sSg+g3P8u&ba3OYrr@uYaHmemkr{&RCm#5+1VC*24qorZ5twAq` zYRCxSab%^F{y^rZQRrf^>|r-hdz;Q?cPB zOB}?;pofM$`~32>uxBE+7=x;Zxrx^)Y#h(fcIM1W0)DnrcCtAmK*zWj>(WaNvo=mh(|khc zy?(%i7NE7Opo0wZDdpmYYVuBVzEvOQPtQ&1c-xlJYhqyq>Q0>^7Gg{w}2qF zSZtv9b>H0izx!X-lMTF_`@ONxM5zXXj(~=iyws*6BR934Jk)QU@C#<=Uni=PxLvX^ z4A1;1B&<1vWzPC0D`a!WX_;`b_oAwh{)WNu<7W+u9*bsWlo|9MpzuNs!j6=ASQ-j0 z5X+Rg+e4!(?2_Xvmi;|vUDJ`FEK%{qY_T{fAKhJcRlSPoGjXU8K>85s-Ee{NIawx( z)7FZ93Ec85Dq6qh5}`Q0V*b!)V|Mofbp#hew%6^+6fdY4%Hp5>$)^01YAWF!J1Ev! 
zJ@2cRSao=K=FWcOXZ+OPPvWVNVOGr$2AI&^M#-w^=7&{$%sR z!4I*4xl3<-F@HgkEW|eOA5qE0rrP%(_>QRGJ6yUAPj-5|YOu3oY+gm0N7Z`2Rz+>@ z_I%^q@zmBYv7YKc-)E<|KiP!Wk4);abVdF9?dY>R8lu==Esa2z4i-xz2<*Ja-$$*a z6k;^ANlq-;yR+%RhtIyPiJs$p$xdtTK#BTWs&d~6an{+)?gkg8kfHctWW}S^MP96e z{EAE{7DY^cbkyKR%W&DNZ^J#Iula{dCpQ9~MG+RQb;Gu7u=jdKBO=L3mQ$QokT__e zo4BAl=ayA9czhyNIcD@%i^mkUI6OWLXE|m=AWT^CAC?$h(K~8uAPQVqRqheIoF00| zbq4GT+(}}FNX;~k!)DH|Do;BG)3)#FhlI!l#wW`8d`D?mS-C-PwomyBZSsWpqR2=qKo3mhq+AI0Cmg7N?9(?3R0CtVD<_ z|70)?cP16lwabOJ!=dO$`-3Q#)UzO@63^2KtP2(pE$HR5@7OM!Uc*bytz59evhozr z#!%WcYc!+d3fiS}`0e^DgudgM!@}5_?xWo;?_E~bC33-aH7i{sZs1!36L21%wd2Q* zFOKnO(-r?lp{AA_tiHPL&J3gSEcE##vNLa*yg`pGev0Eny8ow-2f(=0P8-mOL&0&n zS_tC|FRJqc7?u&~ISI#Xv#vLTJUxPge9vt*eY))I97tPrVzKXtwg9`xMCDdTSajIl zy5ow92`N5xUh*U%W)14q`dJhOTuADPn4UnS#o3RaQrKWr$j$oRC&34za$s#zAJw`+dhysX&U)M2dKUu40q>(9gk9kMg&g)ITue z!a-xCo`q-`cK+j|bSx1|;(}-W*!MuPqWAUsfc({D2eR z^0#5#uC8E{JHJ)v2JPXqb5NJdHz@aI@>#`BOnpbbqgk0uxcSB-6Tj+1rIX`bE=-@g zD{#Ac3iX>gmS2Za6isRJiNM0;a>|Qc=X?f3@R<(uaDMou^ixd|xm0cvxNI+>kyl_1lz^B#X1^GQ>g@WkMDi_HS74*d?!8Qw zgw2{LHtld2cz*F-z_7z%95rT~L6OD~Zz^KYVnLg<+KGwkLvwit!tl?btV%bldeHw6 z?Awgx=*^LJJ7xEH*A_I@rJ&#Gr|MEtU1MLb-wth{rt-BS63P0Qo9Zv(3~P=uqj>_2 zRe}>fuem4o z^tE>#+L!(kP&qar(MD4)`bxwj(fC5$qt5`F%F*c4Z_c*{)?XS|x zIDl_s01xddd-G!NJK3I+<`z26xe=iwlGsWv1`d?P|Mx+uKJjC;p$Blx~@jg z9@HnGMrP`$Xe6ZAG`@da!`I?-4acA$0}8HnrU(zQuCI(2uHFG+xuxq4bZ@`;Z13q2 zGqpmTf$R$GgmWOY`SDCu$1R(CNpX#z`UGY9+s#HZ<>(f@wce_TO=~i z_Ck+fB`Krwjw_NX>XiO4z>}hvSbbe@CSY8U07@olgcLZR2uWgS)cv9l37iI7-)hO1 zA%UYEuXot$<*wLm>Ka=+dxp`g_9@v&H9*Tm%Q#A}H$+OqaBEJ%p$;yEX|@!R7kP$+ z3cLS`p^zTHiK?$YBiUZ9fyeI9z%P#+Qr?mGTaYMHFgDflnHX3%;LHMAB2p>G=7L|S zmzisy`IHu%;-T5`S|dwz9tAV~I%oW~D=E1@*#ijhA~e)oF^}QnMiwLF7(+^DTBXv# znSTbRJ0#=oXSv$W3Yxsv!>v<0i)WA-JLsIZeFUm5OGODW#jYR-W55Isqu#?LqnDUnj|}(m;sKV z-V7&@PibJkz@gtN?8>)V`dyAxGoREAzmDzZyf7iTgKP@}|J`ew=7cti#7r7HOj9;a z?!H8A4X23@tTe)YVyt)q36yYe7z}cJ9!RD!@AX#8=`AAJ&SjZB8?d2i6*5RN!5ooF zflOIxbQ&U)9_n3BR=O=33i~_icT$fuuX8+WTA6>dZK)6lw8k}e>GJu0svj>xci`kG zwJ@Y;gen<9pvzeXQxQaw0!cwx1FotP34HsRI49iUtj}$3@jUSo+^uCYF)F*K{79Q! zO^lU#3#ky&`oXCePsh$5Lnv1Owg+a%@R^taXD0P3{kTpuT{yHsixDx8wNc#khmV?s zec7CHP19lw@ebYO_Kk|C zod*e91c{)DUR8;jfzViQ<{r&(($7|&B9oyLzWAlT+&yp3(qH3&A{du~y~%ska>)C4 zc!PXXYI6Y7AS{n@oDOXD3imT^aX7I%-Mgk>Cm~X+?A(Vozg&^Nw6AIS6hjmFszh&% zHP9mR(zGCMRm*|TP-4jx;3Ul8JIb-Uh~AbYCc|lkyG<*f*!9)A!FN=V$xwl4i|KPX z)1XM91SB-;2&TN??Wq8OBEu&RoPt6EJrxV>nWXDFnv}w^@X5eafBUs(OiL5imXMWLN&3v<8w+4#Iexi>1JR5qm@cO#KnNkYGq0vb>? 
zF^hr$wfBykLr6KZKqeKJ^g7BO@g)Y~R%DR4y@mKKo^Y_zIKzuRlIFK1#-NOPhJqtI z9S|Fob;rWGq0W9~kvCPqA>G#+9p&7xZ_JvbP{t!+PQ!Nndc|-`C>7 z7aGa^OEL2shnel~+2_Yp-QO!4SdY#y?E-rVFv%7@%&X69-l$&l|L=L86CCdU`vmiU e&EwtQUE|jnoXU@!`8_-*#{|xve5F%B{67IOfms9q diff --git a/rend3-test/tests/results/simple/empty.png b/rend3-test/tests/results/simple/empty.png new file mode 100644 index 0000000000000000000000000000000000000000..dfac4a664f2ba7cb7161734ce7e528a42f2daaf0 GIT binary patch literal 434 zcmeAS@N?(olHy`uVBq!ia0vp^4j|0I1|(Ny7T#lEV660XaSW+oeEWcLf{N!P6;Gk= zRWom;0XwS?QDX z^uM{*|NVhK|M#BypC9tCe(|UO(o_H6U-9R^+fbG|X3z9!0V88cM^rX2E*Lyr{an^L HB{Ts5<$4H# literal 0 HcmV?d00001 diff --git a/rend3-test/tests/root.rs b/rend3-test/tests/root.rs index 45080131..ab1b6f0f 100644 --- a/rend3-test/tests/root.rs +++ b/rend3-test/tests/root.rs @@ -1,3 +1,4 @@ mod msaa; +mod object; mod shadow; mod simple; diff --git a/rend3-test/tests/shadow.rs b/rend3-test/tests/shadow.rs index ed717418..36ab0943 100644 --- a/rend3-test/tests/shadow.rs +++ b/rend3-test/tests/shadow.rs @@ -3,7 +3,7 @@ use std::f32::consts::FRAC_PI_2; use anyhow::Context; use glam::{Mat4, Quat, Vec3, Vec3A, Vec4}; use rend3::types::{Camera, Handedness}; -use rend3_test::{no_gpu_return, test_attr, FrameRenderSettings, TestRunner}; +use rend3_test::{no_gpu_return, test_attr, FrameRenderSettings, TestRunner, Threshold}; #[test_attr] pub async fn shadows() -> anyhow::Result<()> { @@ -34,7 +34,14 @@ pub async fn shadows() -> anyhow::Result<()> { let file_name = "tests/results/shadow/plane.png"; runner - .render_and_compare(FrameRenderSettings::new().size(256)?, file_name, 0.02) + .render_and_compare( + FrameRenderSettings::new().size(256)?, + file_name, + Threshold::Percentile { + percentile: 0.5, + threshold: 0.04, + }, + ) .await?; let material2 = runner.add_lit_material(Vec4::new(0.75, 0.5, 0.25, 1.0)); @@ -46,7 +53,14 @@ pub async fn shadows() -> anyhow::Result<()> { let file_name = "tests/results/shadow/cube.png"; runner - .render_and_compare(FrameRenderSettings::new().size(256)?, file_name, 0.02) + .render_and_compare( + FrameRenderSettings::new().size(256)?, + file_name, + Threshold::Percentile { + percentile: 0.5, + threshold: 0.04, + }, + ) .await?; Ok(()) diff --git a/rend3-test/tests/simple.rs b/rend3-test/tests/simple.rs index 65e28753..684c69ce 100644 --- a/rend3-test/tests/simple.rs +++ b/rend3-test/tests/simple.rs @@ -1,9 +1,35 @@ use anyhow::Context; use glam::{Mat4, Vec3, Vec4}; use rend3::types::{Camera, Handedness, MeshBuilder, Object, ObjectMeshKind}; -use rend3_test::{no_gpu_return, test_attr, FrameRenderSettings, TestRunner}; +use rend3_test::{no_gpu_return, test_attr, FrameRenderSettings, TestRunner, Threshold}; use wgpu::FrontFace; +#[test_attr] +pub async fn empty() -> anyhow::Result<()> { + let iad = no_gpu_return!(rend3::create_iad(None, None, None, None).await) + .context("InstanceAdapterDevice creation failed")?; + + let Ok(runner) = TestRunner::builder().iad(iad).build().await else { + return Ok(()); + }; + + runner.set_camera_data(Camera { + projection: rend3::types::CameraProjection::Raw(Mat4::IDENTITY), + view: Mat4::IDENTITY, + }); + + runner + .render_and_compare( + FrameRenderSettings::new(), + "tests/results/simple/empty.png", + Threshold::Mean(0.0), + ) + .await + .context("Image Comparison Failed")?; + + Ok(()) +} + #[test_attr] pub async fn triangle() -> anyhow::Result<()> { let tests = [ @@ -67,8 +93,13 @@ pub async fn triangle() -> anyhow::Result<()> { false => "tests/results/simple/triangle-backface.png", }; runner - 
.render_and_compare(FrameRenderSettings::new(), file_name, 0.0) - .await?; + .render_and_compare(FrameRenderSettings::new(), file_name, Threshold::Mean(0.0)) + .await + .with_context(|| { + format!( + "Comparison failed on test (Handedness::{handedness:?}, FrontFace::{winding:?}, Visible: {visible})" + ) + })?; } Ok(()) @@ -135,8 +166,9 @@ pub async fn coordinate_space() -> anyhow::Result<()> { let file_name = format!("tests/results/simple/coordinate-space-{name}.png"); runner - .render_and_compare(FrameRenderSettings::new(), file_name, 0.0) - .await?; + .render_and_compare(FrameRenderSettings::new(), file_name, Threshold::Mean(0.0)) + .await + .with_context(|| format!("Comparison failed on test {name}"))?; } Ok(()) diff --git a/rend3-types/src/lib.rs b/rend3-types/src/lib.rs index 24694919..6397043e 100644 --- a/rend3-types/src/lib.rs +++ b/rend3-types/src/lib.rs @@ -194,6 +194,18 @@ pub type GraphDataHandleUntyped = ResourceHandle; /// Refcounted handle to an instance of GraphData pub struct GraphDataHandle(pub GraphDataHandleUntyped, pub PhantomData); +impl Debug for GraphDataHandle { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +impl Clone for GraphDataHandle { + fn clone(&self) -> Self { + Self(self.0.clone(), PhantomData) + } +} + /// Internal non-owning handle to a Mesh pub type RawMeshHandle = RawResourceHandle; /// Internal non-owning handle to a Texture2D @@ -252,7 +264,9 @@ pub use wgt::{ /// /// The value allows for 8 bits of information packed in the high 8 bits of the /// index for object recombination. -pub const MAX_VERTEX_COUNT: u32 = 1 << 24; +/// +/// We leave exactly one value at the top for the "invalid vertex" value: 0x00_FF_FF_FF; +pub const MAX_VERTEX_COUNT: u32 = (1 << 24) - 1; /// The maximum amount of indices any one object can have. pub const MAX_INDEX_COUNT: u32 = u32::MAX; @@ -1172,8 +1186,8 @@ impl SampleCount { pub const ARRAY: [Self; 2] = [Self::One, Self::Four]; /// Determines if a resolve texture is needed for this texture. 
- pub fn needs_resolve(self) -> bool { - self != Self::One + pub const fn needs_resolve(self) -> bool { + !matches!(self, Self::One) } } @@ -1193,6 +1207,15 @@ pub enum Handedness { Right, } +impl From for wgt::FrontFace { + fn from(value: Handedness) -> Self { + match value { + Handedness::Left => Self::Cw, + Handedness::Right => Self::Ccw, + } + } +} + impl Default for Handedness { fn default() -> Self { Self::Left diff --git a/rend3/shaders/vertex_attributes.wgsl b/rend3/shaders/vertex_attributes.wgsl index 663d1f94..baa5bf52 100644 --- a/rend3/shaders/vertex_attributes.wgsl +++ b/rend3/shaders/vertex_attributes.wgsl @@ -1,8 +1,43 @@ // -- DO NOT VALIDATE -- struct Indices { - vertex: u32, object: u32, + vertex: u32, +} + +struct BatchIndices { + /// Index _within_ the batch + local_object: u32, + /// Vertex index within the object + vertex: u32, +} + +const INVALID_VERTEX: u32 = 0x00FFFFFFu; + +fn unpack_batch_index(vertex_index: u32) -> BatchIndices { + return BatchIndices( + vertex_index >> 24u, + vertex_index & 0xFFFFFFu, + ); +} + +fn pack_batch_index(local_object: u32, index: u32) -> u32 { + return (local_object << 24u) | (index & 0xFFFFFFu); +} + +alias TriangleVertices = array; +alias TriangleIndices = array; +struct Triangle { + vertices: TriangleVertices, + indices: TriangleIndices, +} + +fn pack_batch_indices(local_object: u32, indices: TriangleIndices) -> TriangleIndices { + return TriangleIndices( + pack_batch_index(local_object, indices[0]), + pack_batch_index(local_object, indices[1]), + pack_batch_index(local_object, indices[2]), + ); } fn extract_attribute_vec2_f32(byte_base_offset: u32, vertex_index: u32) -> vec2 { diff --git a/rend3/src/graph/data_handle.rs b/rend3/src/graph/data_handle.rs deleted file mode 100644 index 85ed170d..00000000 --- a/rend3/src/graph/data_handle.rs +++ /dev/null @@ -1,37 +0,0 @@ -use std::{ - ops::Deref, - sync::atomic::{AtomicUsize, Ordering}, -}; - -use once_cell::sync::Lazy; - -static DATA_HANDLE_INDEX_ALLOCATOR: AtomicUsize = AtomicUsize::new(0); - -pub struct RenderGraphConnection { - name: &'static str, - index: Lazy, -} -impl RenderGraphConnection { - pub const fn new(name: &'static str) -> Self { - Self { - name, - index: Lazy::new(|| DATA_HANDLE_INDEX_ALLOCATOR.fetch_add(1, Ordering::Relaxed)), - } - } - - pub const fn name(&self) -> &'static str { - self.name - } - - pub fn index(&self) -> usize { - *self.index - } -} - -impl Deref for RenderGraphConnection { - type Target = usize; - - fn deref(&self) -> &Self::Target { - &self.index - } -} diff --git a/rend3/src/graph/graph.rs b/rend3/src/graph/graph.rs index c5d2eb6d..6159b83d 100644 --- a/rend3/src/graph/graph.rs +++ b/rend3/src/graph/graph.rs @@ -112,6 +112,8 @@ impl<'node> RenderGraph<'node> { idx, layer_start: 0, layer_end: desc.depth, + mip_start: 0, + mip_end: desc.to_core().mip_count(), viewport: ViewportRect { offset: UVec2::ZERO, size: desc.resolution, @@ -126,6 +128,7 @@ impl<'node> RenderGraph<'node> { &mut self, texture: &'node dyn AsTextureReference, layers: Range, + mips: Range, viewport: ViewportRect, ) -> RenderTargetHandle { let idx = self.imported_targets.len(); @@ -135,6 +138,8 @@ impl<'node> RenderGraph<'node> { idx, layer_start: layers.start, layer_end: layers.end, + mip_start: mips.start, + mip_end: mips.end, viewport, }), } @@ -341,6 +346,8 @@ impl<'node> RenderGraph<'node> { let view = active_textures[®ion.idx].create_view(&TextureViewDescriptor { base_array_layer: region.layer_start, array_layer_count: Some(region.layer_end - region.layer_start), + 
base_mip_level: region.mip_start as u32, + mip_level_count: Some((region.mip_end - region.mip_start) as u32), ..TextureViewDescriptor::default() }); vacant.insert(view); @@ -354,6 +361,8 @@ impl<'node> RenderGraph<'node> { .create_view(&TextureViewDescriptor { base_array_layer: region.layer_start, array_layer_count: Some(region.layer_end - region.layer_start), + base_mip_level: region.mip_start as u32, + mip_level_count: Some((region.mip_end - region.mip_start) as u32), ..TextureViewDescriptor::default() }); vacant.insert(view); diff --git a/rend3/src/graph/mod.rs b/rend3/src/graph/mod.rs index 4d913eb5..cd85fa92 100644 --- a/rend3/src/graph/mod.rs +++ b/rend3/src/graph/mod.rs @@ -47,11 +47,10 @@ use std::ops::Range; use glam::UVec2; use rend3_types::{SampleCount, TextureFormat, TextureUsages}; -use wgpu::{Color, TextureView}; +use wgpu::{Color, Extent3d, TextureDimension, TextureView}; use crate::util::typedefs::SsoString; -mod data_handle; mod encpass; #[allow(clippy::module_inception)] // lmao mod graph; @@ -60,7 +59,6 @@ mod store; mod temp; mod texture_store; -pub use data_handle::*; pub use encpass::*; pub use graph::*; pub use node::*; @@ -75,6 +73,8 @@ pub struct RenderTargetDescriptor { pub resolution: UVec2, pub depth: u32, pub samples: SampleCount, + // None means maximum mip count + pub mip_levels: Option, pub format: TextureFormat, pub usage: TextureUsages, } @@ -84,6 +84,7 @@ impl RenderTargetDescriptor { resolution: self.resolution, depth: self.depth, samples: self.samples, + mip_levels: self.mip_levels, format: self.format, usage: self.usage, } @@ -95,10 +96,26 @@ pub(crate) struct RenderTargetCore { pub resolution: UVec2, pub depth: u32, pub samples: SampleCount, + pub mip_levels: Option, pub format: TextureFormat, pub usage: TextureUsages, } +impl RenderTargetCore { + fn mip_count(&self) -> u8 { + match self.mip_levels { + Some(count) => count, + None => Extent3d { + width: self.resolution.x, + height: self.resolution.y, + // D2 doesn't care about depth + depth_or_array_layers: 1, + } + .max_mips(TextureDimension::D2) as u8, + } + } +} + /// Requirements to render to a particular shadow map. /// /// view + size form the start/end of the viewport to render to. 
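For the `mip_levels: Option<u8>` field added to RenderTargetDescriptor above: `None` means "allocate the full mip chain", whose length `mip_count()` takes from `Extent3d::max_mips`. A minimal sketch of that arithmetic, assuming `max_mips` for a 2D texture evaluates to floor(log2(max(width, height))) + 1; the helper name below is made up for illustration and is not part of this patch.

// Hypothetical stand-in for what RenderTargetCore::mip_count() asks wgpu for:
// the number of mip levels needed to reduce the largest dimension down to 1.
fn full_mip_count(width: u32, height: u32) -> u32 {
    32 - width.max(height).leading_zeros()
}

fn main() {
    assert_eq!(full_mip_count(1920, 1080), 11); // 1920 -> 960 -> ... -> 1, eleven levels
    assert_eq!(full_mip_count(256, 256), 9);    // power-of-two sizes: log2(256) + 1
}

This is the count the hi-z node chain earlier in this patch iterates over when it builds each `HiZ Mip {src} -> {dst}` pass.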
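Stepping back to the `pack_batch_index` / `unpack_batch_index` helpers added to vertex_attributes.wgsl earlier in this patch: a vertex index now carries the batch-local object id in its high 8 bits and the vertex index proper in the low 24 bits, with 0x00FF_FFFF reserved as the invalid-vertex sentinel (which is why MAX_VERTEX_COUNT drops to (1 << 24) - 1). A small Rust restatement of the same bit layout, for illustration only:

const INVALID_VERTEX: u32 = 0x00FF_FFFF;

// Mirror of the WGSL pack_batch_index: 8-bit local object id in the high bits,
// 24-bit vertex index in the low bits.
fn pack_batch_index(local_object: u32, vertex: u32) -> u32 {
    (local_object << 24) | (vertex & 0x00FF_FFFF)
}

// Mirror of the WGSL unpack_batch_index: recover (local_object, vertex).
fn unpack_batch_index(packed: u32) -> (u32, u32) {
    (packed >> 24, packed & 0x00FF_FFFF)
}

fn main() {
    let packed = pack_batch_index(3, 41_256);
    assert_eq!(unpack_batch_index(packed), (3, 41_256));
    // The all-ones low field is the reserved invalid-vertex value.
    assert_eq!(unpack_batch_index(INVALID_VERTEX), (0, INVALID_VERTEX));
}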
@@ -124,6 +141,8 @@ pub(super) struct TextureRegion { idx: usize, layer_start: u32, layer_end: u32, + mip_start: u8, + mip_end: u8, viewport: ViewportRect, } @@ -174,7 +193,11 @@ impl RenderTargetHandle { let left = self.to_region(); let right = other.to_region(); - left.idx == right.idx && left.layer_start == right.layer_start && left.layer_end == right.layer_end + left.idx == right.idx + && left.layer_start == right.layer_start + && left.layer_end == right.layer_end + && left.mip_start == right.mip_start + && left.mip_end == right.mip_end } pub(super) fn to_region(self) -> TextureRegion { @@ -184,11 +207,31 @@ impl RenderTargetHandle { } } - pub fn restrict(mut self, layers: Range, viewport: ViewportRect) -> Self { + pub fn set_layers(mut self, layers: Range) -> Self { match &mut self.resource { GraphSubResource::ImportedTexture(region) | GraphSubResource::Texture(region) => { region.layer_start = layers.start; region.layer_end = layers.end; + } + _ => unreachable!(), + } + self + } + + pub fn set_mips(mut self, mips: Range) -> Self { + match &mut self.resource { + GraphSubResource::ImportedTexture(region) | GraphSubResource::Texture(region) => { + region.mip_start = mips.start; + region.mip_end = mips.end; + } + _ => unreachable!(), + } + self + } + + pub fn set_viewport(mut self, viewport: ViewportRect) -> Self { + match &mut self.resource { + GraphSubResource::ImportedTexture(region) | GraphSubResource::Texture(region) => { region.viewport = viewport; } _ => unreachable!(), diff --git a/rend3/src/graph/node.rs b/rend3/src/graph/node.rs index 4ade24e6..c1f29ea0 100644 --- a/rend3/src/graph/node.rs +++ b/rend3/src/graph/node.rs @@ -136,6 +136,21 @@ impl<'a, 'node> RenderGraphNodeBuilder<'a, 'node> { DeclaredDependency { handle } } + /// Sugar over [add_data] which makes it easy to + /// declare optional textures. 
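+ /// Passing `None` simply returns `None` without declaring any dependency.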
+ /// + /// [add_data]: RenderGraphNodeBuilder::add_data + pub fn add_optional_data( + &mut self, + handle: Option>, + usage: NodeResourceUsage, + ) -> Option>> + where + T: 'static, + { + Some(self.add_data(handle?, usage)) + } + /// Declares a data handle as having the given render targets pub fn add_dependencies_to_render_targets( &mut self, diff --git a/rend3/src/graph/texture_store.rs b/rend3/src/graph/texture_store.rs index 8822ddc5..07981fca 100644 --- a/rend3/src/graph/texture_store.rs +++ b/rend3/src/graph/texture_store.rs @@ -37,7 +37,7 @@ impl GraphTextureStore { height: desc.resolution.y, depth_or_array_layers: 1, }, - mip_level_count: 1, + mip_level_count: desc.mip_count() as u32, sample_count: desc.samples as _, dimension: TextureDimension::D2, format: desc.format, diff --git a/rend3/src/managers/graph_storage.rs b/rend3/src/managers/graph_storage.rs index 0e3888e0..c96d948b 100644 --- a/rend3/src/managers/graph_storage.rs +++ b/rend3/src/managers/graph_storage.rs @@ -1,11 +1,14 @@ -use std::{any::Any, ops::DerefMut}; +use std::{ + any::Any, + ops::{Deref, DerefMut}, +}; -use parking_lot::Mutex; +use parking_lot::RwLock; use rend3_types::{GraphDataHandle, RawGraphDataHandleUntyped, WasmNotSend}; #[derive(Default)] pub struct GraphStorage { - // Type under any is Mutex + // Type under any is RwLock #[cfg(not(target_arch = "wasm32"))] data: Vec>>, #[cfg(target_arch = "wasm32")] @@ -21,14 +24,21 @@ impl GraphStorage { if handle.idx >= self.data.len() { self.data.resize_with(handle.idx + 1, || None); } - self.data[handle.idx] = Some(Box::new(Mutex::new(data))); + self.data[handle.idx] = Some(Box::new(RwLock::new(data))); + } + + pub fn get(&self, handle: &GraphDataHandle) -> impl Deref + '_ { + let rw_lock: &RwLock = self.data[handle.0.idx].as_ref().unwrap().downcast_ref().unwrap(); + rw_lock + .try_read() + .expect("Called get on the same handle that was already borrowed mutably within a renderpass") } pub fn get_mut(&self, handle: &GraphDataHandle) -> impl DerefMut + '_ { - let mutex: &Mutex = self.data[handle.0.idx].as_ref().unwrap().downcast_ref().unwrap(); - mutex - .try_lock() - .expect("Tried to call get_mut on the same handle twice") + let rw_lock: &RwLock = self.data[handle.0.idx].as_ref().unwrap().downcast_ref().unwrap(); + rw_lock + .try_write() + .expect("Tried to call get_mut on the same handle twice within a renderpass") } pub fn remove(&mut self, handle: &RawGraphDataHandleUntyped) { diff --git a/rend3/src/managers/handle_alloc.rs b/rend3/src/managers/handle_alloc.rs index c22d8180..20dba352 100644 --- a/rend3/src/managers/handle_alloc.rs +++ b/rend3/src/managers/handle_alloc.rs @@ -18,6 +18,15 @@ where { max_allocated: AtomicUsize, freelist: Mutex>, + /// We want the render routines to be able to rely on deleted handles being valid for at + /// least one frame. + /// + /// To facilitate this, we first put the handle in the delay list, then at the top of + /// every frame, we move the handles from the delay list to the freelist. + /// + /// We do not need to do this for everything though, only for Object handles, as these + /// are the root handle which the renderer accesses everything. 
+ delay_list: Option>>, _phantom: PhantomData, } @@ -25,10 +34,11 @@ impl HandleAllocator where RawResourceHandle: DeletableRawResourceHandle, { - pub fn new() -> Self { + pub fn new(delay_handle_reclaimation: bool) -> Self { Self { max_allocated: AtomicUsize::new(0), freelist: Mutex::new(Vec::new()), + delay_list: delay_handle_reclaimation.then(|| Mutex::new(Vec::new())), _phantom: PhantomData, } } @@ -49,15 +59,24 @@ where pub fn deallocate(&self, handle: RawResourceHandle) { let idx = handle.idx; - self.freelist.lock().push(idx); + if let Some(ref delay_list) = self.delay_list { + delay_list.lock().push(idx); + } else { + self.freelist.lock().push(idx); + } } -} -impl Default for HandleAllocator -where - RawResourceHandle: DeletableRawResourceHandle, -{ - fn default() -> Self { - Self::new() + pub fn reclaim_delayed_handles(&self) -> Vec> { + if let Some(ref delay_list) = self.delay_list { + let mut locked_delay_list = delay_list.lock(); + + self.freelist.lock().extend_from_slice(&locked_delay_list); + locked_delay_list + .drain(..) + .map(|idx| RawResourceHandle::new(idx)) + .collect() + } else { + Vec::new() + } } } diff --git a/rend3/src/managers/object.rs b/rend3/src/managers/object.rs index c8197cd1..b2bbb345 100644 --- a/rend3/src/managers/object.rs +++ b/rend3/src/managers/object.rs @@ -31,6 +31,8 @@ pub struct ShaderObject { pub material_index: u32, pub vertex_attribute_start_offsets: >::U32Array, + // 1 if enabled, 0 if disabled + pub enabled: u32, } impl Default for ShaderObject { @@ -42,6 +44,7 @@ impl Default for ShaderObject { index_count: Default::default(), material_index: Default::default(), vertex_attribute_start_offsets: Zeroable::zeroed(), + enabled: Default::default(), } } } @@ -87,8 +90,8 @@ struct ObjectArchetype { buffer: FreelistDerivedBuffer, set_object_transform: fn(&mut WasmVecAny, &mut FreelistDerivedBuffer, usize, Mat4), duplicate_object: fn(&WasmVecAny, usize, ObjectChange) -> Object, - remove: fn(&mut WasmVecAny, usize), - evaluate: fn(&mut ObjectArchetype, &Device, &mut CommandEncoder, &ScatterCopy), + remove: fn(&mut ObjectArchetype, usize), + evaluate: fn(&mut ObjectArchetype, &Device, &mut CommandEncoder, &ScatterCopy, &[RawObjectHandle]), } /// Manages objects. That's it. 
¯\\\_(ツ)\_/¯ @@ -167,14 +170,18 @@ impl ObjectManager { let archetype = self.archetype.get_mut(&type_id).unwrap(); - (archetype.remove)(&mut archetype.data_vec, handle.idx); - - archetype.object_count -= 1; + (archetype.remove)(archetype, handle.idx); } - pub fn evaluate(&mut self, device: &Device, encoder: &mut CommandEncoder, scatter: &ScatterCopy) { + pub fn evaluate( + &mut self, + device: &Device, + encoder: &mut CommandEncoder, + scatter: &ScatterCopy, + deferred_removals: &[RawObjectHandle], + ) { for archetype in self.archetype.values_mut() { - (archetype.evaluate)(archetype, device, encoder, scatter); + (archetype.evaluate)(archetype, device, encoder, scatter, deferred_removals); } } @@ -298,6 +305,7 @@ pub(super) fn object_add_callback(_material: &M, args: ObjectAddCal first_index: (index_range.start / 4) as u32, index_count: ((index_range.end - index_range.start) / 4) as u32, vertex_attribute_start_offsets, + enabled: true as u32, }, material_handle: args.object.material, mesh_kind: args.object.mesh_kind, @@ -345,10 +353,21 @@ fn duplicate_object(data: &WasmVecAny, idx: usize, change: ObjectCh } } -fn remove(data: &mut WasmVecAny, idx: usize) { - let data_vec = data.downcast_slice_mut::>>().unwrap(); +fn remove(archetype: &mut ObjectArchetype, idx: usize) { + let data_vec = archetype + .data_vec + .downcast_slice_mut::>>() + .unwrap(); - data_vec[idx] = None; + // We don't actually remove the object at this point, + // we just mark it as disabled. Next frame, this handle + // will be provided in `deferred_removals` in `evaluate` + // so we can actually delete it. + // + // We defer objects one frame so that temporal culling + // has valid data. + archetype.buffer.use_index(idx); + data_vec[idx].as_mut().unwrap().inner.enabled = false as u32; } fn evaluate( @@ -356,12 +375,23 @@ fn evaluate( device: &Device, encoder: &mut CommandEncoder, scatter: &ScatterCopy, + deferred_removals: &[RawObjectHandle], ) { let data_vec = archetype .data_vec - .downcast_slice::>>() + .downcast_slice_mut::>>() .unwrap(); + for removal in deferred_removals { + // Only one archetype will have each handle, + // so if we have it, we can be sure it's ours. 
+ let removed_obj = data_vec[removal.idx].take(); + + if removed_obj.is_some() { + archetype.object_count -= 1; + } + } + archetype.buffer.apply(device, encoder, scatter, |idx| { data_vec[idx].as_ref().map(|o| o.inner).unwrap_or_default() }) diff --git a/rend3/src/renderer/eval.rs b/rend3/src/renderer/eval.rs index 18c68d10..aca3a623 100644 --- a/rend3/src/renderer/eval.rs +++ b/rend3/src/renderer/eval.rs @@ -11,6 +11,8 @@ pub fn evaluate_instructions(renderer: &Renderer) -> InstructionEvaluationOutput let mut instructions = renderer.instructions.consumer.lock(); + let delayed_object_handles = renderer.resource_handle_allocators.object.reclaim_delayed_handles(); + // 16 encoders is a reasonable default let mut cmd_bufs = Vec::with_capacity(16); @@ -171,9 +173,12 @@ pub fn evaluate_instructions(renderer: &Renderer) -> InstructionEvaluationOutput // Do these in dependency order // Level 3 - data_core - .object_manager - .evaluate(&renderer.device, &mut encoder, &renderer.scatter); + data_core.object_manager.evaluate( + &renderer.device, + &mut encoder, + &renderer.scatter, + &delayed_object_handles, + ); // Level 2 let d2_texture = data_core.d2_texture_manager.evaluate(&renderer.device); diff --git a/rend3/src/renderer/mod.rs b/rend3/src/renderer/mod.rs index d721bf21..68a9115a 100644 --- a/rend3/src/renderer/mod.rs +++ b/rend3/src/renderer/mod.rs @@ -65,7 +65,6 @@ pub struct Renderer { } /// Handle allocators -#[derive(Default)] struct HandleAllocators { pub mesh: HandleAllocator, pub skeleton: HandleAllocator, @@ -77,6 +76,21 @@ struct HandleAllocators { pub graph_storage: HandleAllocator, } +impl Default for HandleAllocators { + fn default() -> Self { + Self { + mesh: HandleAllocator::new(false), + skeleton: HandleAllocator::new(false), + d2_texture: HandleAllocator::new(false), + d2c_texture: HandleAllocator::new(false), + material: HandleAllocator::new(false), + object: HandleAllocator::new(true), + directional_light: HandleAllocator::new(false), + graph_storage: HandleAllocator::new(false), + } + } +} + /// All the mutex protected data within the renderer pub struct RendererDataCore { /// Position and settings of the camera. 
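Pulling the last few hunks together: only the object allocator is built with `HandleAllocator::new(true)`, so deleting an object parks its handle in the delay list, the object itself is merely flagged `enabled = 0`, and at the top of the next frame `reclaim_delayed_handles` both returns the indices to the freelist and hands them to `ObjectManager::evaluate` as `deferred_removals` so the slots can finally be cleared. This keeps deleted handles valid for one extra frame, which the temporal culling relies on. A stripped-down sketch of that allocator behaviour, with rend3's generic handle types replaced by plain indices (illustration only, not the actual implementation):

// Toy allocator: freed indices wait in `delay_list` for one frame before reuse.
struct DelayedAllocator {
    next: usize,
    freelist: Vec<usize>,
    delay_list: Vec<usize>,
}

impl DelayedAllocator {
    fn allocate(&mut self) -> usize {
        if let Some(idx) = self.freelist.pop() {
            idx
        } else {
            let idx = self.next;
            self.next += 1;
            idx
        }
    }

    // Deallocation only parks the index; the handle stays usable this frame.
    fn deallocate(&mut self, idx: usize) {
        self.delay_list.push(idx);
    }

    // Called once at the top of a frame: the parked indices become reusable and
    // are returned so dependent data (the disabled objects) can be cleaned up.
    fn reclaim_delayed(&mut self) -> Vec<usize> {
        self.freelist.extend_from_slice(&self.delay_list);
        self.delay_list.drain(..).collect()
    }
}

fn main() {
    let mut alloc = DelayedAllocator { next: 0, freelist: Vec::new(), delay_list: Vec::new() };
    let h = alloc.allocate();
    alloc.deallocate(h);
    assert_eq!(alloc.reclaim_delayed(), vec![h]); // next frame: handle comes back for cleanup
    assert_eq!(alloc.allocate(), h);              // and the index is reusable again
}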
diff --git a/rend3/src/shader.rs b/rend3/src/shader.rs index 279f35c5..829f5016 100644 --- a/rend3/src/shader.rs +++ b/rend3/src/shader.rs @@ -20,6 +20,7 @@ struct Rend3ShaderSources; #[derive(Debug, Default, Serialize)] pub struct ShaderConfig { pub profile: Option, + pub position_attribute_offset: usize, } pub struct ShaderVertexBufferConfig { @@ -263,11 +264,10 @@ impl<'a> ShaderVertexBufferHelper<'a> { let unpack_function = format!( " fn unpack_vertex_index(vertex_index: u32) -> Indices {{ - let local_object_id = vertex_index >> 24u; - let vertex_id = vertex_index & 0xFFFFFFu; - let object_id = {batch_buffer}.ranges[local_object_id].object_id; + let batch_indices = unpack_batch_index(vertex_index); + let object_id = {batch_buffer}.object_culling_information[batch_indices.local_object].object_id; - return Indices(vertex_id, object_id); + return Indices(object_id, batch_indices.vertex); }}" ); @@ -354,7 +354,10 @@ mod tests { let mut pp = ShaderPreProcessor::new(); pp.add_shader("simple", "{{include \"other\"}} simple"); pp.add_shader("other", "other"); - let config = ShaderConfig { profile: None }; + let config = ShaderConfig { + profile: None, + position_attribute_offset: 0, + }; let output = pp.render_shader("simple", &config, None).unwrap(); assert_eq!(output, "other simple"); @@ -365,7 +368,10 @@ mod tests { let mut pp = ShaderPreProcessor::new(); pp.add_shader("simple", "{{include \"other\"}} simple"); pp.add_shader("other", "{{include \"simple\"}} other"); - let config = ShaderConfig { profile: None }; + let config = ShaderConfig { + profile: None, + position_attribute_offset: 0, + }; let output = pp.render_shader("simple", &config, None).unwrap(); assert_eq!(output, " other simple"); @@ -375,7 +381,10 @@ mod tests { fn error_include() { let mut pp = ShaderPreProcessor::new(); pp.add_shader("simple", "{{include \"other\"}} simple"); - let config = ShaderConfig { profile: None }; + let config = ShaderConfig { + profile: None, + position_attribute_offset: 0, + }; let output = pp.render_shader("simple", &config, None); assert!(output.is_err(), "Expected error, got {output:?}"); @@ -385,7 +394,10 @@ mod tests { fn no_arg_include() { let mut pp = ShaderPreProcessor::new(); pp.add_shader("simple", "{{include}} simple"); - let config = ShaderConfig { profile: None }; + let config = ShaderConfig { + profile: None, + position_attribute_offset: 0, + }; let output = pp.render_shader("simple", &config, None); assert!(output.is_err(), "Expected error, got {output:?}"); diff --git a/rend3/src/util/bind_merge.rs b/rend3/src/util/bind_merge.rs index 3258c2ad..97f1fd7f 100644 --- a/rend3/src/util/bind_merge.rs +++ b/rend3/src/util/bind_merge.rs @@ -96,6 +96,15 @@ impl<'a> BindGroupBuilder<'a> { self } + pub fn append_buffer_with_offset_and_size(&mut self, buffer: &'a Buffer, offset: u64, size: u64) -> &mut Self { + self.append(BindingResource::Buffer(BufferBinding { + buffer, + offset, + size: NonZeroU64::new(size), + })); + self + } + pub fn append_sampler(&mut self, sampler: &'a Sampler) -> &mut Self { self.append(BindingResource::Sampler(sampler)); self diff --git a/rend3/src/util/buffer.rs b/rend3/src/util/buffer.rs index bee9a28f..6613880b 100644 --- a/rend3/src/util/buffer.rs +++ b/rend3/src/util/buffer.rs @@ -1,6 +1,6 @@ //! Automatic management of Power-of-Two sized buffers. 
-use std::{marker::PhantomData, ops::Deref}; +use std::{marker::PhantomData, ops::Deref, sync::Arc}; use encase::{private::WriteInto, ShaderType}; use wgpu::{Buffer, BufferAddress, BufferDescriptor, BufferUsages, Device, Queue}; @@ -8,8 +8,9 @@ use wgpu::{Buffer, BufferAddress, BufferDescriptor, BufferUsages, Device, Queue} use crate::util::typedefs::SsoString; /// Creates, fills, and automatically resizes a power-of-two sized buffer. +#[derive(Debug)] pub struct WrappedPotBuffer { - inner: Buffer, + inner: Arc, size: BufferAddress, // This field is assumed to be a power of 2. minimum: BufferAddress, @@ -30,12 +31,12 @@ where let usage = usage | BufferUsages::COPY_DST; Self { - inner: device.create_buffer(&BufferDescriptor { + inner: Arc::new(device.create_buffer(&BufferDescriptor { label: Some(label), size: minimum, usage, mapped_at_creation: false, - }), + })), size: minimum, minimum, usage, @@ -48,12 +49,12 @@ where let resize = resize_po2(self.size, desired, self.minimum); if let Some(size) = resize { self.size = size; - self.inner = device.create_buffer(&BufferDescriptor { + self.inner = Arc::new(device.create_buffer(&BufferDescriptor { label: Some(&self.label), size, usage: self.usage, mapped_at_creation: false, - }); + })); } } diff --git a/rend3/src/util/freelist/buffer.rs b/rend3/src/util/freelist/buffer.rs index efc695da..f9dd0b2e 100644 --- a/rend3/src/util/freelist/buffer.rs +++ b/rend3/src/util/freelist/buffer.rs @@ -5,11 +5,6 @@ use wgpu::{Buffer, BufferDescriptor, BufferUsages, CommandEncoder, Device}; use crate::util::scatter_copy::{ScatterCopy, ScatterData}; -const STARTING_SIZE: usize = 16; -const NEEDED_USAGES: BufferUsages = BufferUsages::STORAGE - .union(BufferUsages::COPY_DST) - .union(BufferUsages::COPY_SRC); - pub struct FreelistDerivedBuffer { inner: Buffer, @@ -21,6 +16,11 @@ pub struct FreelistDerivedBuffer { stale: Vec, } impl FreelistDerivedBuffer { + pub const STARTING_SIZE: usize = 16; + pub const NEEDED_USAGES: BufferUsages = BufferUsages::STORAGE + .union(BufferUsages::COPY_DST) + .union(BufferUsages::COPY_SRC); + pub fn new(device: &Device) -> Self where T: ShaderSize + WriteInto + 'static, @@ -29,16 +29,16 @@ impl FreelistDerivedBuffer { let buffer = device.create_buffer(&BufferDescriptor { label: Some("freelist buffer"), - size: rounded_size * STARTING_SIZE as u64, - usage: NEEDED_USAGES, + size: rounded_size * Self::STARTING_SIZE as u64, + usage: Self::NEEDED_USAGES, mapped_at_creation: false, }); Self { inner: buffer, - current_count: STARTING_SIZE, - reserved_count: STARTING_SIZE, + current_count: Self::STARTING_SIZE, + reserved_count: Self::STARTING_SIZE, rounded_size, stored_type: TypeId::of::(), @@ -70,7 +70,7 @@ impl FreelistDerivedBuffer { let new_buffer = device.create_buffer(&BufferDescriptor { label: Some("freelist buffer"), size: self.rounded_size * self.reserved_count as u64, - usage: NEEDED_USAGES, + usage: Self::NEEDED_USAGES, mapped_at_creation: false, }); diff --git a/rend3/src/util/math.rs b/rend3/src/util/math.rs index 484d7bcc..18638b86 100644 --- a/rend3/src/util/math.rs +++ b/rend3/src/util/math.rs @@ -2,13 +2,95 @@ use num_traits::PrimInt; -/// Rounds up `src` to the power of two `factor`. -pub fn round_up(src: T, factor: T) -> T { - let minus1 = factor - T::one(); - ((src + minus1) / factor) * factor +pub trait IntegerExt: PrimInt { + /// Rounds T away from zero to the nearest multiple of b. + /// + /// Panics if b is zero or negative. 
+ fn round_up(self, b: Self) -> Self { + round_up(self, b) + } + + /// Performs integer division between a and b rounding away from zero, instead of towards it. + /// + /// Panics if b is zero or negative. + fn div_round_up(self, b: Self) -> Self { + div_round_up(self, b) + } +} + +impl IntegerExt for T {} + +/// Rounds T away from zero to the nearest multiple of b. +/// +/// Panics if b is zero or negative. +pub fn round_up(a: T, b: T) -> T { + assert!(b > T::zero(), "divisor must be non-zero and positive"); + // All the negative infrastructure will compile away if T is unsigned as this is unconditionally false + let negative = a < T::zero(); + + let pos_a = if negative { T::zero() - a } else { a }; + + let rem = pos_a % b; + if rem == T::zero() { + return a; + } + + let pos_res = pos_a + (b - rem); + + if negative { + T::zero() - pos_res + } else { + pos_res + } +} + +/// Performs integer division between a and b rounding away from zero, instead of towards it. +/// +/// Panics if b is zero or negative. +pub fn div_round_up(a: T, b: T) -> T { + assert!(b > T::zero(), "divisor must be non-zero and positive"); + // All the negative infrastructure will compile away if T is unsigned as this is unconditionally false + let negative = a < T::zero(); + + let pos_a = if negative { T::zero() - a } else { a }; + + let pos_res = (pos_a + (b - T::one())) / b; + + if negative { + T::zero() - pos_res + } else { + pos_res + } } -/// Performs integer division between a and b rounding up, instead of down -pub fn round_up_div(a: T, b: T) -> T { - (a + (b - T::one())) / b +#[cfg(test)] +mod tests { + #[test] + fn round_up() { + assert_eq!(super::round_up(2, 12), 12); + assert_eq!(super::round_up(12, 12), 12); + assert_eq!(super::round_up(0, 12), 0); + + // Negatives + assert_eq!(super::round_up(-14, 12), -24); + assert_eq!(super::round_up(-8, 12), -12); + + // Identity + assert_eq!(super::round_up(2, 1), 2); + } + + #[test] + fn round_up_div() { + assert_eq!(super::div_round_up(2, 12), 1); + assert_eq!(super::div_round_up(12, 12), 1); + assert_eq!(super::div_round_up(18, 12), 2); + assert_eq!(super::div_round_up(0, 12), 0); + + // Negatives + assert_eq!(super::div_round_up(-14, 12), -2); + assert_eq!(super::div_round_up(-8, 12), -1); + + // Identity + assert_eq!(super::div_round_up(2, 1), 2); + } } diff --git a/rend3/src/util/scatter_copy.rs b/rend3/src/util/scatter_copy.rs index 863bb076..006aa7bc 100644 --- a/rend3/src/util/scatter_copy.rs +++ b/rend3/src/util/scatter_copy.rs @@ -8,7 +8,7 @@ use wgpu::{ use crate::util::{ bind_merge::{BindGroupBuilder, BindGroupLayoutBuilder}, - math::round_up_div, + math::div_round_up, }; pub struct ScatterData { @@ -134,7 +134,7 @@ impl ScatterCopy { }); cpass.set_pipeline(&self.pipeline); cpass.set_bind_group(0, &bg, &[]); - cpass.dispatch_workgroups(round_up_div(count_u32, 64), 1, 1); + cpass.dispatch_workgroups(div_round_up(count_u32, 64), 1, 1); drop(cpass); } }
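Finally, a worked example of the renamed `div_round_up`, as it is used by both the GPU skinner and the scatter-copy dispatches above (an element count divided by a workgroup size of 64). This is a simplified, unsigned-only restatement for illustration; the real helper in rend3::util::math is generic and also handles negative inputs.

// Divide, rounding the quotient up, so a partially filled workgroup still gets dispatched.
fn div_round_up(a: u32, b: u32) -> u32 {
    (a + (b - 1)) / b
}

fn main() {
    // 1000 vertices / 64 threads per workgroup = 15.625, so 16 workgroups cover every vertex.
    assert_eq!(div_round_up(1_000, 64), 16);
    assert_eq!(div_round_up(128, 64), 2); // exact multiples stay exact
    assert_eq!(div_round_up(0, 64), 0);   // nothing to do, nothing dispatched
}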