From 7d1e0125116ab62535413723f09769fc821faa9f Mon Sep 17 00:00:00 2001 From: Connor Fitzgerald Date: Mon, 27 Nov 2023 02:24:23 -0500 Subject: [PATCH] Gpu culling (#489) Co-authored-by: Connor Fitzgerald --- .github/workflows/ci.yml | 2 + Cargo.toml | 2 +- deny.toml | 2 - examples/animation/src/lib.rs | 10 +- examples/cube-no-framework/src/main.rs | 13 +- examples/cube/src/lib.rs | 8 +- examples/egui/src/main.rs | 8 +- examples/scene-viewer/src/lib.rs | 48 +- examples/skinning/src/lib.rs | 8 +- examples/static-gltf/src/main.rs | 8 +- examples/textured-quad/src/main.rs | 8 +- rend3-framework/src/lib.rs | 1 + rend3-routine/Cargo.toml | 8 +- rend3-routine/shaders/src/cull.wgsl | 396 +++++++++-- rend3-routine/shaders/src/depth.wgsl | 28 +- rend3-routine/shaders/src/hi_z.wgsl | 33 + rend3-routine/shaders/src/opaque.wgsl | 28 +- .../shaders/src/resolve_depth_min.wgsl | 27 + rend3-routine/shaders/src/structures.wgsl | 36 +- .../shaders/src/structures_object.wgsl | 13 +- rend3-routine/shaders/src/uniform_prep.wgsl | 27 + rend3-routine/src/base.rs | 224 ++++-- rend3-routine/src/clear.rs | 28 +- rend3-routine/src/common/camera.rs | 36 + rend3-routine/src/common/interfaces.rs | 9 + rend3-routine/src/common/mod.rs | 2 + rend3-routine/src/culling/batching.rs | 116 ++- rend3-routine/src/culling/culler.rs | 665 ++++++++++++++---- rend3-routine/src/culling/mod.rs | 6 +- rend3-routine/src/culling/suballoc.rs | 227 ++++++ rend3-routine/src/forward.rs | 128 +++- rend3-routine/src/hi_z.rs | 276 ++++++++ rend3-routine/src/lib.rs | 1 + rend3-routine/src/pbr/routine.rs | 14 +- rend3-routine/src/shaders.rs | 34 +- rend3-routine/src/skinning.rs | 4 +- rend3-test/Cargo.toml | 1 + rend3-test/src/helpers.rs | 13 +- rend3-test/src/lib.rs | 2 + rend3-test/src/runner.rs | 29 +- rend3-test/src/threshold.rs | 73 ++ rend3-test/tests/msaa.rs | 6 +- rend3-test/tests/object.rs | 61 ++ .../results/object/multi-frame-add-0.png | Bin 0 -> 439 bytes .../results/object/multi-frame-add-1.png | Bin 0 -> 435 bytes rend3-test/tests/results/shadow/cube.png | Bin 5966 -> 5415 bytes rend3-test/tests/results/simple/empty.png | Bin 0 -> 434 bytes rend3-test/tests/root.rs | 1 + rend3-test/tests/shadow.rs | 20 +- rend3-test/tests/simple.rs | 42 +- rend3-types/src/lib.rs | 29 +- rend3/shaders/vertex_attributes.wgsl | 37 +- rend3/src/graph/data_handle.rs | 37 - rend3/src/graph/graph.rs | 9 + rend3/src/graph/mod.rs | 53 +- rend3/src/graph/node.rs | 15 + rend3/src/graph/texture_store.rs | 2 +- rend3/src/managers/graph_storage.rs | 26 +- rend3/src/managers/handle_alloc.rs | 37 +- rend3/src/managers/object.rs | 52 +- rend3/src/renderer/eval.rs | 11 +- rend3/src/renderer/mod.rs | 16 +- rend3/src/shader.rs | 28 +- rend3/src/util/bind_merge.rs | 9 + rend3/src/util/buffer.rs | 13 +- rend3/src/util/freelist/buffer.rs | 20 +- rend3/src/util/math.rs | 96 ++- rend3/src/util/scatter_copy.rs | 4 +- 68 files changed, 2730 insertions(+), 496 deletions(-) create mode 100644 rend3-routine/shaders/src/hi_z.wgsl create mode 100644 rend3-routine/shaders/src/resolve_depth_min.wgsl create mode 100644 rend3-routine/shaders/src/uniform_prep.wgsl create mode 100644 rend3-routine/src/common/camera.rs create mode 100644 rend3-routine/src/culling/suballoc.rs create mode 100644 rend3-routine/src/hi_z.rs create mode 100644 rend3-test/src/threshold.rs create mode 100644 rend3-test/tests/object.rs create mode 100644 rend3-test/tests/results/object/multi-frame-add-0.png create mode 100644 rend3-test/tests/results/object/multi-frame-add-1.png create mode 100644 
rend3-test/tests/results/simple/empty.png delete mode 100644 rend3/src/graph/data_handle.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8589f2eb..98d4572b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -79,6 +79,8 @@ jobs: if: matrix.target != 'wasm32-unknown-unknown' - uses: actions/upload-artifact@v3 + # always run + if: ${{ !cancelled() }} with: name: comparison-images-${{ matrix.name }} path: | diff --git a/Cargo.toml b/Cargo.toml index 5fc6ff1a..ba1f3401 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,7 @@ members = [ "rend3-gltf", "rend3-routine", "rend3-test", - "rend3-types" + "rend3-types", ] [profile.ci] diff --git a/deny.toml b/deny.toml index 446bd270..3c631336 100644 --- a/deny.toml +++ b/deny.toml @@ -13,8 +13,6 @@ license-files = [{ path = "COPYRIGHT", hash = 972598577 }] multiple-versions = "deny" wildcards = "allow" skip = [ - # hashbrown - { name = "ahash", version = "0.7.6" }, # gltf / reqwest { name = "base64", version = "0.13.1" }, # ddsfile diff --git a/examples/animation/src/lib.rs b/examples/animation/src/lib.rs index 458ad3ba..2ddcfa72 100644 --- a/examples/animation/src/lib.rs +++ b/examples/animation/src/lib.rs @@ -2,7 +2,7 @@ use std::{path::Path, sync::Arc}; use rend3::types::DirectionalLightHandle; -const SAMPLE_COUNT: rend3::types::SampleCount = rend3::types::SampleCount::Four; +const SAMPLE_COUNT: rend3::types::SampleCount = rend3::types::SampleCount::One; /// The application data, can only be obtained at `setup` time, so it's under an /// Option in the main struct. @@ -164,8 +164,12 @@ impl rend3_framework::App for AnimationExample { let mut graph = rend3::graph::RenderGraph::new(); // Import the surface texture into the render graph. - let frame_handle = - graph.add_imported_render_target(&frame, 0..1, rend3::graph::ViewportRect::from_size(resolution)); + let frame_handle = graph.add_imported_render_target( + &frame, + 0..1, + 0..1, + rend3::graph::ViewportRect::from_size(resolution), + ); // Add the default rendergraph without a skybox base_rendergraph.add_to_graph( &mut graph, diff --git a/examples/cube-no-framework/src/main.rs b/examples/cube-no-framework/src/main.rs index fa04e20a..6f294959 100644 --- a/examples/cube-no-framework/src/main.rs +++ b/examples/cube-no-framework/src/main.rs @@ -1,3 +1,5 @@ +#![cfg_attr(target_arch = "wasm32", allow(clippy::arc_with_non_send_sync))] + use std::sync::Arc; fn vertex(pos: [f32; 3]) -> glam::Vec3 { @@ -107,8 +109,13 @@ fn main() { let base_rendergraph = rend3_routine::base::BaseRenderGraph::new(&renderer, &spp); let mut data_core = renderer.data_core.lock(); - let pbr_routine = - rend3_routine::pbr::PbrRoutine::new(&renderer, &mut data_core, &spp, &base_rendergraph.interfaces); + let pbr_routine = rend3_routine::pbr::PbrRoutine::new( + &renderer, + &mut data_core, + &spp, + &base_rendergraph.interfaces, + &base_rendergraph.gpu_culler.culling_buffer_map_handle, + ); drop(data_core); let tonemapping_routine = rend3_routine::tonemapping::TonemappingRoutine::new( &renderer, @@ -209,7 +216,7 @@ fn main() { // Import the surface texture into the render graph. 
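Every example in this patch updates the same graph call: add_imported_render_target now takes a second 0..1 range (presumably the mip levels to import) alongside the existing layer range, ahead of the viewport. For reference, the new call shape, lifted from the hunks that follow (frame, graph, and resolution come from each example's redraw handler):

    // Layer range first, then the newly added range, then the viewport.
    let frame_handle = graph.add_imported_render_target(
        &frame,
        0..1, // layers
        0..1, // mips
        rend3::graph::ViewportRect::from_size(resolution),
    );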
let frame_handle = - graph.add_imported_render_target(&frame, 0..1, rend3::graph::ViewportRect::from_size(resolution)); + graph.add_imported_render_target(&frame, 0..1, 0..1, rend3::graph::ViewportRect::from_size(resolution)); // Add the default rendergraph without a skybox base_rendergraph.add_to_graph( &mut graph, diff --git a/examples/cube/src/lib.rs b/examples/cube/src/lib.rs index 10c04bcc..14faa4d0 100644 --- a/examples/cube/src/lib.rs +++ b/examples/cube/src/lib.rs @@ -167,8 +167,12 @@ impl rend3_framework::App for CubeExample { let mut graph = rend3::graph::RenderGraph::new(); // Import the surface texture into the render graph. - let frame_handle = - graph.add_imported_render_target(&frame, 0..1, rend3::graph::ViewportRect::from_size(resolution)); + let frame_handle = graph.add_imported_render_target( + &frame, + 0..1, + 0..1, + rend3::graph::ViewportRect::from_size(resolution), + ); // Add the default rendergraph without a skybox base_rendergraph.add_to_graph( &mut graph, diff --git a/examples/egui/src/main.rs b/examples/egui/src/main.rs index 98c4d0c5..70b81753 100644 --- a/examples/egui/src/main.rs +++ b/examples/egui/src/main.rs @@ -216,8 +216,12 @@ impl rend3_framework::App for EguiExample { let mut graph = rend3::graph::RenderGraph::new(); // Import the surface texture into the render graph. - let frame_handle = - graph.add_imported_render_target(&frame, 0..1, rend3::graph::ViewportRect::from_size(resolution)); + let frame_handle = graph.add_imported_render_target( + &frame, + 0..1, + 0..1, + rend3::graph::ViewportRect::from_size(resolution), + ); // Add the default rendergraph without a skybox base_rendergraph.add_to_graph( &mut graph, diff --git a/examples/scene-viewer/src/lib.rs b/examples/scene-viewer/src/lib.rs index bb01576d..bd61d8bc 100644 --- a/examples/scene-viewer/src/lib.rs +++ b/examples/scene-viewer/src/lib.rs @@ -166,6 +166,22 @@ fn extract_vsync(value: &str) -> Result }) } +fn extract_array(value: &str, default: [f32; N]) -> Result<[f32; N], &'static str> { + let mut res = default; + let split: Vec<_> = value.split(',').enumerate().collect(); + + if split.len() != N { + return Err("Mismatched argument count"); + } + + for (idx, inner) in split { + let inner = inner.trim(); + + res[idx] = inner.parse().map_err(|_| "Cannot parse argument number")?; + } + Ok(res) +} + fn extract_vec3(value: &str) -> Result { let mut res = [0.0_f32, 0.0, 0.0]; let split: Vec<_> = value.split(',').enumerate().collect(); @@ -254,6 +270,7 @@ Assets: Controls: --walk Walk speed (speed without holding shift) in units/second (typically meters). Default 10. --run Run speed (speed while holding shift) in units/second (typically meters). Default 50. + --camera x,y,z,pitch,yaw Spawns the camera at the given position. Press Period to get the current camera position. 
"; struct SceneViewer { @@ -326,6 +343,10 @@ impl SceneViewer { // Controls let walk_speed = args.value_from_str("--walk").unwrap_or(10.0_f32); let run_speed = args.value_from_str("--run").unwrap_or(50.0_f32); + let camera_default = [3.0, 3.0, 3.0, -std::f32::consts::FRAC_PI_8, std::f32::consts::FRAC_PI_4]; + let camera_info = args + .value_from_str("--camera") + .map_or(camera_default, |s: String| extract_array(&s, camera_default).unwrap()); // Free args let file_to_load: Option = args.free_from_str().ok(); @@ -382,9 +403,9 @@ impl SceneViewer { fullscreen, scancode_status: FastHashMap::default(), - camera_pitch: -std::f32::consts::FRAC_PI_8, - camera_yaw: std::f32::consts::FRAC_PI_4, - camera_location: Vec3A::new(3.0, 3.0, 3.0), + camera_pitch: camera_info[3], + camera_yaw: camera_info[4], + camera_location: Vec3A::new(camera_info[0], camera_info[1], camera_info[2]), previous_profiling_stats: None, timestamp_last_second: Instant::now(), timestamp_last_frame: Instant::now(), @@ -524,6 +545,8 @@ impl rend3_framework::App for SceneViewer { self.timestamp_last_frame = now; + // std::thread::sleep(Duration::from_millis(100)); + let rotation = Mat3A::from_euler(glam::EulerRot::XYZ, -self.camera_pitch, -self.camera_yaw, 0.0).transpose(); let forward = -rotation.z_axis; @@ -549,8 +572,15 @@ impl rend3_framework::App for SceneViewer { if button_pressed(&self.scancode_status, platform::Scancodes::Q) { self.camera_location += up * velocity * delta_time.as_secs_f32(); } - if button_pressed(&self.scancode_status, platform::Scancodes::Z) { - self.camera_location -= up * velocity * delta_time.as_secs_f32(); + if button_pressed(&self.scancode_status, platform::Scancodes::PERIOD) { + println!( + "{x},{y},{z},{pitch},{yaw}", + x = self.camera_location.x, + y = self.camera_location.y, + z = self.camera_location.z, + pitch = self.camera_pitch, + yaw = self.camera_yaw + ); } if button_pressed(&self.scancode_status, platform::Scancodes::ESCAPE) { @@ -595,8 +625,12 @@ impl rend3_framework::App for SceneViewer { // Build a rendergraph let mut graph = rend3::graph::RenderGraph::new(); - let frame_handle = - graph.add_imported_render_target(&frame, 0..1, rend3::graph::ViewportRect::from_size(resolution)); + let frame_handle = graph.add_imported_render_target( + &frame, + 0..1, + 0..1, + rend3::graph::ViewportRect::from_size(resolution), + ); // Add the default rendergraph base_rendergraph.add_to_graph( &mut graph, diff --git a/examples/skinning/src/lib.rs b/examples/skinning/src/lib.rs index 8cd322c2..ec7401b1 100644 --- a/examples/skinning/src/lib.rs +++ b/examples/skinning/src/lib.rs @@ -158,8 +158,12 @@ impl rend3_framework::App for SkinningExample { // Build a rendergraph let mut graph = rend3::graph::RenderGraph::new(); - let frame_handle = - graph.add_imported_render_target(&frame, 0..1, rend3::graph::ViewportRect::from_size(resolution)); + let frame_handle = graph.add_imported_render_target( + &frame, + 0..1, + 0..1, + rend3::graph::ViewportRect::from_size(resolution), + ); // Add the default rendergraph without a skybox base_rendergraph.add_to_graph( &mut graph, diff --git a/examples/static-gltf/src/main.rs b/examples/static-gltf/src/main.rs index e60b759f..3402f367 100644 --- a/examples/static-gltf/src/main.rs +++ b/examples/static-gltf/src/main.rs @@ -149,8 +149,12 @@ impl rend3_framework::App for GltfExample { let mut graph = rend3::graph::RenderGraph::new(); // Import the surface texture into the render graph. 
- let frame_handle = - graph.add_imported_render_target(&frame, 0..1, rend3::graph::ViewportRect::from_size(resolution)); + let frame_handle = graph.add_imported_render_target( + &frame, + 0..1, + 0..1, + rend3::graph::ViewportRect::from_size(resolution), + ); // Add the default rendergraph without a skybox base_rendergraph.add_to_graph( &mut graph, diff --git a/examples/textured-quad/src/main.rs b/examples/textured-quad/src/main.rs index 26d29029..2545ed4f 100644 --- a/examples/textured-quad/src/main.rs +++ b/examples/textured-quad/src/main.rs @@ -174,8 +174,12 @@ impl rend3_framework::App for TexturedQuadExample { let mut graph = rend3::graph::RenderGraph::new(); // Import the surface texture into the render graph. - let frame_handle = - graph.add_imported_render_target(&frame, 0..1, rend3::graph::ViewportRect::from_size(resolution)); + let frame_handle = graph.add_imported_render_target( + &frame, + 0..1, + 0..1, + rend3::graph::ViewportRect::from_size(resolution), + ); // Add the default rendergraph base_rendergraph.add_to_graph( &mut graph, diff --git a/rend3-framework/src/lib.rs b/rend3-framework/src/lib.rs index 12299c05..a52fec37 100644 --- a/rend3-framework/src/lib.rs +++ b/rend3-framework/src/lib.rs @@ -259,6 +259,7 @@ pub async fn async_start + 'static, T: 'static>(mut app: A, window_bui &mut data_core, &spp, &base_rendergraph.interfaces, + &base_rendergraph.gpu_culler.culling_buffer_map_handle, )), skybox: Mutex::new(rend3_routine::skybox::SkyboxRoutine::new( &renderer, diff --git a/rend3-routine/Cargo.toml b/rend3-routine/Cargo.toml index dd624ad3..881834eb 100644 --- a/rend3-routine/Cargo.toml +++ b/rend3-routine/Cargo.toml @@ -15,20 +15,18 @@ rust-version = "1.71" arrayvec = "0.7" bitflags = "2" bytemuck = "1" +codespan-reporting = "0.11" encase = { version = "0.6", features = ["glam"] } flume = "0.11" glam = { version = "0.24.0", features = ["bytemuck"] } log = "0.4" +naga = { version = "0.14", features = ["wgsl-in"] } ordered-float = "4" parking_lot = "0.12" profiling = {version = "1", default-features = false } rend3 = { version = "^0.3.0", path = "../rend3" } rust-embed = { version = "8", features = ["interpolate-folder-path"] } serde = { version = "1", features = ["derive"] } +serde_json = "1" wgpu = "0.18.0" wgpu-profiler = "0.15.0" - -[dev-dependencies] -codespan-reporting = "0.11" -naga = { version = "0.14", features = ["wgsl-in"] } -serde_json = { version = "1" } diff --git a/rend3-routine/shaders/src/cull.wgsl b/rend3-routine/shaders/src/cull.wgsl index 0809ebb5..9dc7792a 100644 --- a/rend3-routine/shaders/src/cull.wgsl +++ b/rend3-routine/shaders/src/cull.wgsl @@ -5,72 +5,374 @@ var vertex_buffer: array; @group(0) @binding(1) var object_buffer: array; + +fn vertex_fetch( + object_invocation: u32, + object_info: ptr, +) -> Triangle { + let index_0_index = object_invocation * 3u + 0u; + let index_1_index = object_invocation * 3u + 1u; + let index_2_index = object_invocation * 3u + 2u; + + let object = object_buffer[(*object_info).object_id]; + + let index0 = vertex_buffer[object.first_index + index_0_index]; + let index1 = vertex_buffer[object.first_index + index_1_index]; + let index2 = vertex_buffer[object.first_index + index_2_index]; + + let position_start_offset = object.vertex_attribute_start_offsets[{{position_attribute_offset}}]; + let model_position0 = extract_attribute_vec3_f32(position_start_offset, index0); + let model_position1 = extract_attribute_vec3_f32(position_start_offset, index1); + let model_position2 = 
extract_attribute_vec3_f32(position_start_offset, index2); + + return Triangle( + TriangleVertices(model_position0, model_position1, model_position2), + TriangleIndices(index0, index1, index2) + ); +} + @group(0) @binding(2) var culling_job: BatchData; + +struct DrawCallBuffer { + /// We always put the buffer that needs to be present in the next frame first. + predicted_object_offset: u32, + residual_object_offset: u32, + calls: array, +} + @group(0) @binding(3) -var output_buffer: array; +var draw_calls: DrawCallBuffer; + +fn init_draw_calls(global_invocation: u32, region_id: u32) { + // Init the inheritable draw call + let predicted_object_draw_index = draw_calls.predicted_object_offset + region_id; + draw_calls.calls[predicted_object_draw_index].vertex_offset = 0; + draw_calls.calls[predicted_object_draw_index].instance_count = 1u; + draw_calls.calls[predicted_object_draw_index].base_instance = 0u; + draw_calls.calls[predicted_object_draw_index].base_index = global_invocation * 3u; + + // Init the residual objects draw call + let residual_object_draw_index = draw_calls.residual_object_offset + region_id; + draw_calls.calls[residual_object_draw_index].vertex_offset = 0; + draw_calls.calls[residual_object_draw_index].instance_count = 1u; + draw_calls.calls[residual_object_draw_index].base_instance = 0u; + draw_calls.calls[residual_object_draw_index].base_index = global_invocation * 3u; +} + +fn add_predicted_triangle_to_draw_call(region_id: u32) -> u32 { + let output_region_index = atomicAdd(&draw_calls.calls[draw_calls.predicted_object_offset + region_id].vertex_count, 3u); + let output_region_triangle = output_region_index / 3u; + return output_region_triangle; +} + +fn add_residual_triangle_to_draw_call(region_id: u32) -> u32 { + let output_region_index = atomicAdd(&draw_calls.calls[draw_calls.residual_object_offset + region_id].vertex_count, 3u); + let output_region_triangle = output_region_index / 3u; + return output_region_triangle; +} + +struct OutputIndexBuffer { + /// We always put the buffer that needs to be present in the next frame first. 
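The add_predicted_triangle_to_draw_call and add_residual_triangle_to_draw_call helpers above build the indirect draw calls by bumping each region's vertex_count atomically, so every surviving triangle claims its own output slot without any ordering between threads. A CPU-side model of that allocation, using a hypothetical claim_triangle_slot helper with the same fetch-add-by-3 scheme:

    use std::sync::atomic::{AtomicU32, Ordering};

    // Each surviving triangle claims three indices from the region's indirect
    // draw call; the previous value of vertex_count, divided by 3, is the
    // triangle's slot inside that region's output range.
    fn claim_triangle_slot(region_vertex_count: &AtomicU32) -> u32 {
        let first_index = region_vertex_count.fetch_add(3, Ordering::Relaxed);
        first_index / 3
    }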
+ predicted_object_offset: u32, + residual_object_offset: u32, + indices: array, +} +@group(0) @binding(4) +var output_indices : OutputIndexBuffer; + +fn write_predicted_atomic_triangle( + batch_object_index: u32, + object_info: ptr, + indices: TriangleIndices, +) { + let region_invocation = add_predicted_triangle_to_draw_call((*object_info).region_id); + let batch_invocation = region_invocation + (*object_info).region_base_invocation; + let global_invocation = batch_invocation + culling_job.batch_base_invocation; + + let packed_indices = pack_batch_indices(batch_object_index, indices); + + let predicted_object_indices_index = output_indices.predicted_object_offset + global_invocation * 3u; + output_indices.indices[predicted_object_indices_index] = packed_indices[0]; + output_indices.indices[predicted_object_indices_index + 1u] = packed_indices[1]; + output_indices.indices[predicted_object_indices_index + 2u] = packed_indices[2]; +} + +fn write_residual_atomic_triangle( + batch_object_index: u32, + object_info: ptr, + indices: TriangleIndices, +) { + let region_invocation = add_residual_triangle_to_draw_call((*object_info).region_id); + let batch_invocation = region_invocation + (*object_info).region_base_invocation; + let global_invocation = batch_invocation + culling_job.batch_base_invocation; -struct ObjectRangeIndex { - range: ObjectRange, - index: u32, + let packed_indices = pack_batch_indices(batch_object_index, indices); + + let residual_object_indices_index = output_indices.residual_object_offset + global_invocation * 3u; + output_indices.indices[residual_object_indices_index] = packed_indices[0]; + output_indices.indices[residual_object_indices_index + 1u] = packed_indices[1]; + output_indices.indices[residual_object_indices_index + 2u] = packed_indices[2]; +} + +fn write_residual_nonatomic_triangle( + invocation: u32, + batch_object_index: u32, + object_info: ptr, + indices: TriangleIndices, +) { + add_residual_triangle_to_draw_call((*object_info).region_id); + + let packed_indices = pack_batch_indices(batch_object_index, indices); + + let residual_object_indices_index = output_indices.residual_object_offset + invocation * 3u; + output_indices.indices[residual_object_indices_index] = packed_indices[0]; + output_indices.indices[residual_object_indices_index + 1u] = packed_indices[1]; + output_indices.indices[residual_object_indices_index + 2u] = packed_indices[2]; +} + +fn write_invalid_residual_nonatomic_triangle(invocation: u32, object_info: ptr) { + add_residual_triangle_to_draw_call((*object_info).region_id); + + let residual_object_indices_index = output_indices.residual_object_offset + invocation * 3u; + output_indices.indices[residual_object_indices_index] = INVALID_VERTEX; + output_indices.indices[residual_object_indices_index + 1u] = INVALID_VERTEX; + output_indices.indices[residual_object_indices_index + 2u] = INVALID_VERTEX; +} + +struct CullingResults { + /// We always put the buffer that needs to be present in the next frame first. 
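The culling results buffer below stores one visibility bit per invocation, packed 32 to a word; get_previous_culling_result recovers last frame's verdict for a triangle from it. A small Rust model of that lookup (the helper name is illustrative, the bit layout mirrors the shader):

    // Invocation i lives at word i / 32, bit i % 32 of the culling-results
    // buffer; a set bit means the triangle passed culling last frame.
    fn passed_culling_last_frame(bits: &[u32], invocation: u32) -> bool {
        ((bits[(invocation / 32) as usize] >> (invocation % 32)) & 1) == 1
    }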
+ output_offset: u32, + input_offset: u32, + bits: array, +} +@group(0) @binding(5) +var culling_results: CullingResults; + +fn get_previous_culling_result(object_info: ptr, object_invocation: u32) -> bool { + if (*object_info).previous_global_invocation == 0xFFFFFFFFu { + return false; + } + + let previous_global_invocation = object_invocation + (*object_info).previous_global_invocation; + let bitmask = culling_results.bits[culling_results.input_offset + (previous_global_invocation / 32u)]; + return ((bitmask >> (previous_global_invocation % 32u)) & 0x1u) == 0x1u; } -var workgroup_object_range: ObjectRangeIndex; +@group(0) @binding(6) +var per_camera_uniform: PerCameraUniform; + +fn is_shadow_pass() -> bool { + return per_camera_uniform.shadow_index != 0xFFFFFFFFu; +} + +@group(0) @binding(7) +var hirearchical_z_buffer: texture_depth_2d; +@group(0) @binding(8) +var nearest_sampler: sampler; + +{{include "rend3/vertex_attributes.wgsl"}} + +struct ObjectSearchResult { + range: ObjectCullingInformation, + index_within_region: u32, +} + +fn find_object_info(wid: u32) -> ObjectSearchResult { + let target_invocation = wid * 64u; + // pulled directly from https://doc.rust-lang.org/src/core/slice/mod.rs.html#2412-2438 + + var size = culling_job.total_objects; + var left = 0u; + var right = size; + var object_info: ObjectCullingInformation; + while left < right { + let mid = left + size / 2u; + + let probe = culling_job.object_culling_information[mid]; + + if probe.invocation_end <= target_invocation { + left = mid + 1u; + } else if probe.invocation_start > target_invocation { + right = mid; + } else { + return ObjectSearchResult(probe, mid); + } + + size = right - left; + } + + // This is unreachable, but required for the compiler to be happy + return ObjectSearchResult(object_info, 0xFFFFFFFFu); +} -@compute @workgroup_size(256) +// 64 workgroup size / 32 bits +var workgroup_culling_results: array, 2>; + +fn clear_culling_results(lid: u32) { + if lid == 0u { + atomicStore(&workgroup_culling_results[0], 0u); + atomicStore(&workgroup_culling_results[1], 0u); + } +} + +fn compute_culling_results(lid: u32, passed_culling: bool) { + atomicOr(&workgroup_culling_results[lid / 32u], u32(passed_culling) << (lid % 32u)); +} + +fn save_culling_results(global_invocation: u32, lid: u32) { + if lid == 0u { + let culling_results_index = culling_results.output_offset + (global_invocation / 32u); + culling_results.bits[culling_results_index + 0u] = atomicLoad(&workgroup_culling_results[0]); + culling_results.bits[culling_results_index + 1u] = atomicLoad(&workgroup_culling_results[1]); + } +} + +fn textureSampleMin(texture: texture_depth_2d, uv: vec2, mipmap: f32) -> f32 { + let int_mipmap = i32(mipmap); + let mip_resolution = vec2(textureDimensions(texture, int_mipmap).xy); + + let pixel_coords = uv * mip_resolution - 0.5; + + let low = vec2(max(floor(pixel_coords), vec2(0.0))); + let high = vec2(min(ceil(pixel_coords), mip_resolution - 1.0)); + + let top_left = vec2(low.x, low.y); + let top_right = vec2(high.x, low.y); + let bottom_left = vec2(low.x, high.y); + let bottom_right = vec2(high.x, high.y); + + var minval = textureLoad(texture, top_left, int_mipmap); + minval = min(minval, textureLoad(texture, top_right, int_mipmap)); + minval = min(minval, textureLoad(texture, bottom_left, int_mipmap)); + minval = min(minval, textureLoad(texture, bottom_right, int_mipmap)); + return minval; +} + +fn execute_culling( + model_view_proj: mat4x4, + vertices: TriangleVertices, +) -> bool { + let position0 = 
model_view_proj * vec4(vertices[0], 1.0); + let position1 = model_view_proj * vec4(vertices[1], 1.0); + let position2 = model_view_proj * vec4(vertices[2], 1.0); + + let det = determinant(mat3x3(position0.xyw, position1.xyw, position2.xyw)); + + if (per_camera_uniform.flags & PCU_FLAGS_AREA_VISIBLE_MASK) == PCU_FLAGS_POSITIVE_AREA_VISIBLE && det <= 0.0 { + return false; + } + if (per_camera_uniform.flags & PCU_FLAGS_AREA_VISIBLE_MASK) == PCU_FLAGS_NEGATIVE_AREA_VISIBLE && det >= 0.0 { + return false; + } + + let ndc0 = position0.xyz / position0.w; + let ndc1 = position1.xyz / position1.w; + let ndc2 = position2.xyz / position2.w; + + let min_ndc_xy = min(ndc0.xy, min(ndc1.xy, ndc2.xy)); + let max_ndc_xy = max(ndc0.xy, max(ndc1.xy, ndc2.xy)); + + let half_res = per_camera_uniform.resolution / 2.0; + let min_screen_xy = (min_ndc_xy + 1.0) * half_res; + let max_screen_xy = (max_ndc_xy + 1.0) * half_res; + + if (per_camera_uniform.flags & PCU_FLAGS_MULTISAMPLE_MASK) == PCU_FLAGS_MULTISAMPLE_DISABLED { + let misses_pixel_center = any(round(min_screen_xy) == round(max_screen_xy)); + + if misses_pixel_center { + return false; + } + } + + // We skip hi-z calculation if we're doing a shadow pass + if per_camera_uniform.shadow_index != 0xFFFFFFFFu { + return true; + } + + var min_tex_coords = (min_ndc_xy + 1.0) / 2.0; + var max_tex_coords = (max_ndc_xy + 1.0) / 2.0; + min_tex_coords.y = 1.0 - min_tex_coords.y; + max_tex_coords.y = 1.0 - max_tex_coords.y; + + let uv = (max_tex_coords + min_tex_coords) / 2.0; + let edges = max_screen_xy - min_screen_xy; + + let longest_edge = max(edges.x, edges.y); + let mip = ceil(log2(max(longest_edge, 1.0))); + + let depth = max(max(ndc0.z, ndc1.z), ndc2.z); + let occlusion_depth = textureSampleMin(hirearchical_z_buffer, uv, mip); + + if depth < occlusion_depth { + return false; + } + + return true; +} + +@compute @workgroup_size(64) fn cs_main( @builtin(workgroup_id) wid: vec3, @builtin(global_invocation_id) gid: vec3, @builtin(local_invocation_id) lid: vec3, ) { - if (lid.x == 0u) { - let target_invocation = wid.x * 256u; - // pulled directly from https://doc.rust-lang.org/src/core/slice/mod.rs.html#2412-2438 - - var size = culling_job.total_objects; - var left = 0u; - var right = size; - while left < right { - let mid = left + size / 2u; - - let probe = culling_job.ranges[mid]; - - if probe.invocation_end <= target_invocation { - left = mid + 1u; - } else if probe.invocation_start > target_invocation { - right = mid; - } else { - workgroup_object_range = ObjectRangeIndex(probe, mid); - break; - } + clear_culling_results(lid.x); + + let object_search_results = find_object_info(wid.x); + var object_info = object_search_results.range; + let batch_object_index = object_search_results.index_within_region; + let global_invocation = culling_job.batch_base_invocation + gid.x; - size = right - left; + // We need the workgroupBarrier to be in uniform control flow, so we can't return early here. + // + // If this is true, continue working on the other side of the barrier. 
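execute_culling above rejects a triangle when the hi-z pyramid shows that everything under its screen-space bounds is already covered by nearer geometry. The mip level it samples is chosen so the triangle's bounding box shrinks to at most about one texel at that level, letting the four-tap textureSampleMin bound the depth beneath the whole box. A sketch of the same arithmetic (hi_z_mip is an illustrative helper; the formula is the shader's ceil(log2(max(longest_edge, 1.0)))):

    // For a screen-space AABB whose longest edge is `longest_edge` pixels,
    // mip ceil(log2(longest_edge)) is the first level where that edge fits
    // within roughly one texel.
    fn hi_z_mip(extent_pixels: [f32; 2]) -> f32 {
        let longest_edge = extent_pixels[0].max(extent_pixels[1]).max(1.0);
        longest_edge.log2().ceil()
    }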
+ var write_culling_output = true; + if gid.x >= object_info.invocation_end { + if object_info.atomic_capable == 0u { + write_invalid_residual_nonatomic_triangle(global_invocation, &object_info); } - } + write_culling_output = false; + } else { + let object_invocation = gid.x - object_info.invocation_start; - workgroupBarrier(); + // If the first invocation in the region, set the region's draw call + if object_info.local_region_id == 0u && object_invocation == 0u { + init_draw_calls(global_invocation, object_info.region_id); + } - let object_range = workgroup_object_range.range; - let local_object_index = workgroup_object_range.index; + let triangle = vertex_fetch(object_invocation, &object_info); - if (gid.x >= object_range.invocation_end) { - output_buffer[(culling_job.base_output_invocation + gid.x) * 3u + 0u] = 0x00FFFFFFu; - output_buffer[(culling_job.base_output_invocation + gid.x) * 3u + 1u] = 0x00FFFFFFu; - output_buffer[(culling_job.base_output_invocation + gid.x) * 3u + 2u] = 0x00FFFFFFu; - return; - } + let model_view_proj = per_camera_uniform.objects[object_info.object_id].model_view_proj; - let index_0_index = (gid.x - object_range.invocation_start) * 3u + 0u; - let index_1_index = (gid.x - object_range.invocation_start) * 3u + 1u; - let index_2_index = (gid.x - object_range.invocation_start) * 3u + 2u; + let passes_culling = execute_culling(model_view_proj, triangle.vertices); - let object = object_buffer[object_range.object_id]; + if object_info.atomic_capable == 1u { + if passes_culling { + write_predicted_atomic_triangle(batch_object_index, &object_info, triangle.indices); - let index0 = vertex_buffer[object.first_index + index_0_index]; - let index1 = vertex_buffer[object.first_index + index_1_index]; - let index2 = vertex_buffer[object.first_index + index_2_index]; + if !is_shadow_pass() { + let previously_passed_culling = get_previous_culling_result(&object_info, object_invocation); + + if !previously_passed_culling { + write_residual_atomic_triangle(batch_object_index, &object_info, triangle.indices); + } + } + } + } else { + if passes_culling { + write_residual_nonatomic_triangle(global_invocation, batch_object_index, &object_info, triangle.indices); + } else { + write_invalid_residual_nonatomic_triangle(global_invocation, &object_info); + } + } + + compute_culling_results(lid.x, passes_culling); + } - output_buffer[(culling_job.base_output_invocation + gid.x) * 3u + 0u] = (local_object_index << 24u) | (index0 & ((1u << 24u) - 1u)); - output_buffer[(culling_job.base_output_invocation + gid.x) * 3u + 1u] = (local_object_index << 24u) | (index1 & ((1u << 24u) - 1u)); - output_buffer[(culling_job.base_output_invocation + gid.x) * 3u + 2u] = (local_object_index << 24u) | (index2 & ((1u << 24u) - 1u)); + workgroupBarrier(); + + if write_culling_output { + save_culling_results(global_invocation, lid.x); + } } diff --git a/rend3-routine/shaders/src/depth.wgsl b/rend3-routine/shaders/src/depth.wgsl index 54984865..9705f756 100644 --- a/rend3-routine/shaders/src/depth.wgsl +++ b/rend3-routine/shaders/src/depth.wgsl @@ -15,16 +15,18 @@ var object_buffer: array; var batch_data: BatchData; @group(1) @binding(2) var vertex_buffer: array; +@group(1) @binding(3) +var per_camera_uniform: PerCameraUniform; {{#if (eq profile "GpuDriven")}} -@group(1) @binding(3) +@group(1) @binding(4) var materials: array; @group(2) @binding(0) var textures: binding_array>; {{/if}} {{#if (eq profile "CpuDriven")}} -@group(1) @binding(3) +@group(1) @binding(4) var materials: array; @group(2) @binding(0) 
var albedo_tex: texture_2d; @@ -49,19 +51,31 @@ struct VertexOutput { } @vertex -fn vs_main(@builtin(instance_index) shadow_number: u32, @builtin(vertex_index) vertex_index: u32) -> VertexOutput { - if vertex_index == 0x00FFFFFFu { +fn vs_main(@builtin(vertex_index) vertex_index: u32) -> VertexOutput { + // If the vertex index is our sentinel invalid value, return a degenerate triangle. + // + // This is used by the culling shader to discard triangles when the ordering of the + // triangles are important, and atomics can't be used. + if vertex_index == INVALID_VERTEX { var vs_out: VertexOutput; vs_out.position = vec4(0.0); return vs_out; } let indices = unpack_vertex_index(vertex_index); + + let data = object_buffer[indices.object]; + // If the object is disabled, return a degenerate triangle. + // + // This happens when the object is deleted, and we're rendering last-frame's objects. + if data.enabled == 0u { + var vs_out: VertexOutput; + vs_out.position = vec4(0.0); + return vs_out; + } let vs_in = get_vertices(indices); - let data = object_buffer[indices.object]; - // TODO: Store these in uniforms - let model_view_proj = directional_lights.data[shadow_number].view_proj * data.transform; + let model_view_proj = per_camera_uniform.objects[indices.object].model_view_proj; let position_vec4 = vec4(vs_in.position, 1.0); diff --git a/rend3-routine/shaders/src/hi_z.wgsl b/rend3-routine/shaders/src/hi_z.wgsl new file mode 100644 index 00000000..ecd1e176 --- /dev/null +++ b/rend3-routine/shaders/src/hi_z.wgsl @@ -0,0 +1,33 @@ +@group(0) @binding(0) +var source: texture_depth_2d; + +struct VertexOutput { + @builtin(position) position: vec4, + @location(0) @interpolate(flat) resolution: vec2, +} + +@vertex +fn vs_main(@builtin(vertex_index) id: u32) -> VertexOutput { + let resolution = vec2(textureDimensions(source)); + var output: VertexOutput; + output.position = vec4(f32(id / 2u) * 4.0 - 1.0, f32(id % 2u) * 4.0 - 1.0, 0.0, 1.0); + output.resolution = resolution; + return output; +} + +@fragment +fn fs_main(vout: VertexOutput) -> @builtin(frag_depth) f32 { + let this_tex_coord = vec2(vout.position.xy); + let previous_base_tex_coord = 2u * this_tex_coord; + + let count_odd = vout.resolution & vec2(1u); + + var nearest = 1.0; + for (var x = 0u; x < 2u + count_odd.x; x += 1u) { + for (var y = 0u; y < 2u + count_odd.y; y += 1u) { + nearest = min(nearest, textureLoad(source, previous_base_tex_coord + vec2(x, y), 0)); + } + } + + return nearest; +} \ No newline at end of file diff --git a/rend3-routine/shaders/src/opaque.wgsl b/rend3-routine/shaders/src/opaque.wgsl index 739eb6f1..3d805515 100644 --- a/rend3-routine/shaders/src/opaque.wgsl +++ b/rend3-routine/shaders/src/opaque.wgsl @@ -25,16 +25,18 @@ var object_buffer: array; var batch_data: BatchData; @group(1) @binding(2) var vertex_buffer: array; +@group(1) @binding(3) +var per_camera_uniform: PerCameraUniform; {{#if (eq profile "GpuDriven")}} -@group(1) @binding(3) +@group(1) @binding(4) var materials: array; @group(2) @binding(0) var textures: binding_array>; {{/if}} {{#if (eq profile "CpuDriven")}} -@group(1) @binding(3) +@group(1) @binding(4) var materials: array; @group(2) @binding(0) var albedo_tex: texture_2d; @@ -86,19 +88,31 @@ struct VertexOutput { @vertex fn vs_main(@builtin(vertex_index) vertex_index: u32) -> VertexOutput { - if vertex_index == 0x00FFFFFFu { + // If the vertex index is our sentinel invalid value, return a degenerate triangle. 
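The new hi_z.wgsl pass above (and the resolve_depth_min shader further down) draws a single oversized triangle instead of a quad, using the id / 2 and id % 2 expressions in vs_main. A quick check of that formula in Rust, assuming vertex ids 0, 1, 2:

    // id 0 -> (-1, -1), id 1 -> (-1, 3), id 2 -> (3, -1); the clip-space
    // square [-1, 1]^2 lies entirely inside this triangle, so the pass needs
    // no vertex or index buffers.
    fn fullscreen_triangle_vertex(id: u32) -> (f32, f32) {
        ((id / 2) as f32 * 4.0 - 1.0, (id % 2) as f32 * 4.0 - 1.0)
    }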
+ // + // This is used by the culling shader to discard triangles when the ordering of the + // triangles are important, and atomics can't be used. + if vertex_index == INVALID_VERTEX { var vs_out: VertexOutput; vs_out.position = vec4(0.0); return vs_out; } let indices = unpack_vertex_index(vertex_index); + + let data = object_buffer[indices.object]; + // If the object is disabled, return a degenerate triangle. + // + // This happens when the object is deleted, and we're rendering last-frame's objects. + if data.enabled == 0u { + var vs_out: VertexOutput; + vs_out.position = vec4(0.0); + return vs_out; + } let vs_in = get_vertices(indices); - let data = object_buffer[indices.object]; - // TODO: Store these in uniforms - let model_view = uniforms.view * data.transform; - let model_view_proj = uniforms.view_proj * data.transform; + let model_view = per_camera_uniform.objects[indices.object].model_view; + let model_view_proj = per_camera_uniform.objects[indices.object].model_view_proj; let position_vec4 = vec4(vs_in.position, 1.0); let mv_mat3 = mat3x3(model_view[0].xyz, model_view[1].xyz, model_view[2].xyz); diff --git a/rend3-routine/shaders/src/resolve_depth_min.wgsl b/rend3-routine/shaders/src/resolve_depth_min.wgsl new file mode 100644 index 00000000..677ef722 --- /dev/null +++ b/rend3-routine/shaders/src/resolve_depth_min.wgsl @@ -0,0 +1,27 @@ +@group(0) @binding(0) +var source: texture_depth_multisampled_2d; + +const SAMPLES: i32 = {{SAMPLES}}; + +struct VertexOutput { + @builtin(position) position: vec4, +} + +@vertex +fn vs_main(@builtin(vertex_index) id: u32) -> VertexOutput { + let resolution = vec2(textureDimensions(source)); + var output: VertexOutput; + output.position = vec4(f32(id / 2u) * 4.0 - 1.0, f32(id % 2u) * 4.0 - 1.0, 0.0, 1.0); + return output; +} + +@fragment +fn fs_main(vout: VertexOutput) -> @builtin(frag_depth) f32 { + var nearest: f32 = 1.0; + + for (var sample = 0; sample < SAMPLES; sample += 1) { + nearest = min(nearest, textureLoad(source, vec2u(vout.position.xy), sample)); + } + + return nearest; +} diff --git a/rend3-routine/shaders/src/structures.wgsl b/rend3-routine/shaders/src/structures.wgsl index f01f2cb4..0d85bda4 100644 --- a/rend3-routine/shaders/src/structures.wgsl +++ b/rend3-routine/shaders/src/structures.wgsl @@ -18,7 +18,7 @@ struct ObjectOutputData { } struct IndirectCall { - vertex_count: u32, + vertex_count: atomic, instance_count: u32, base_index: u32, vertex_offset: i32, @@ -37,6 +37,40 @@ struct UniformData { resolution: vec2, } +struct PerCameraUniformObjectData { + // TODO: use less space + model_view: mat4x4, + // TODO: use less space + model_view_proj: mat4x4, +} + +struct PerCameraUniform { + // TODO: use less space + view: mat4x4, + // TODO: use less space + view_proj: mat4x4, + // The index of which shadow caster we are rendering for. + // + // This will be u32::MAX if we're rendering for a camera, not a shadow map. + shadow_index: u32, + frustum: Frustum, + resolution: vec2, + // Uses PCU_FLAGS_* constants + flags: u32, + object_count: u32, + objects: array, +} + +// Area visible +const PCU_FLAGS_AREA_VISIBLE_MASK: u32 = 0x1u; +const PCU_FLAGS_NEGATIVE_AREA_VISIBLE: u32 = 0x0u; +const PCU_FLAGS_POSITIVE_AREA_VISIBLE: u32 = 0x1u; + +// Multisampled +const PCU_FLAGS_MULTISAMPLE_MASK: u32 = 0x2u; +const PCU_FLAGS_MULTISAMPLE_DISABLED: u32 = 0x0u; +const PCU_FLAGS_MULTISAMPLE_ENABLED: u32 = 0x2u; + struct DirectionalLight { /// View/Projection of directional light. 
Shadow rendering uses viewports /// so this always outputs [-1, 1] no matter where in the atlast the shadow is. diff --git a/rend3-routine/shaders/src/structures_object.wgsl b/rend3-routine/shaders/src/structures_object.wgsl index 19d00fc4..2bb5b32c 100644 --- a/rend3-routine/shaders/src/structures_object.wgsl +++ b/rend3-routine/shaders/src/structures_object.wgsl @@ -7,17 +7,24 @@ struct Object { index_count: u32, material_index: u32, vertex_attribute_start_offsets: array, + // 1 if enabled, 0 if disabled + enabled: u32, } -struct ObjectRange { +struct ObjectCullingInformation { invocation_start: u32, invocation_end: u32, object_id: u32, + region_id: u32, + region_base_invocation: u32, + local_region_id: u32, + previous_global_invocation: u32, + atomic_capable: u32, } struct BatchData { - ranges: array, total_objects: u32, total_invocations: u32, - base_output_invocation: u32, + batch_base_invocation: u32, + object_culling_information: array, } diff --git a/rend3-routine/shaders/src/uniform_prep.wgsl b/rend3-routine/shaders/src/uniform_prep.wgsl new file mode 100644 index 00000000..7cc2ae8f --- /dev/null +++ b/rend3-routine/shaders/src/uniform_prep.wgsl @@ -0,0 +1,27 @@ +{{include "rend3-routine/structures.wgsl"}} +{{include "rend3-routine/structures_object.wgsl"}} + +@group(0) @binding(0) +var object_buffer: array; +@group(0) @binding(1) +var per_camera_uniform: PerCameraUniform; + +@compute @workgroup_size(64) +fn cs_main( + @builtin(global_invocation_id) gid: vec3, +) { + let idx = gid.x; + + if idx >= per_camera_uniform.object_count { + return; + } + if object_buffer[idx].enabled == 0u { + return; + } + + let model_view = per_camera_uniform.view * object_buffer[idx].transform; + let model_view_proj = per_camera_uniform.view_proj * object_buffer[idx].transform; + + per_camera_uniform.objects[idx].model_view = model_view; + per_camera_uniform.objects[idx].model_view_proj = model_view_proj; +} diff --git a/rend3-routine/src/base.rs b/rend3-routine/src/base.rs index aac4755d..6252db3c 100644 --- a/rend3-routine/src/base.rs +++ b/rend3-routine/src/base.rs @@ -27,7 +27,53 @@ use rend3::{ }; use wgpu::{BindGroup, Buffer}; -use crate::{common, culling, forward::RoutineAddToGraphArgs, pbr, skinning, skybox, tonemapping}; +use crate::{ + common::{self, CameraIndex}, + culling, + forward::RoutineAddToGraphArgs, + pbr, skinning, skybox, tonemapping, +}; + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct DepthTargets { + pub single_sample_mipped: RenderTargetHandle, + pub multi_sample: Option, +} + +impl DepthTargets { + pub fn new(graph: &mut RenderGraph<'_>, resolution: UVec2, samples: SampleCount) -> Self { + let single_sample_mipped = graph.add_render_target(RenderTargetDescriptor { + label: Some("hdr depth".into()), + resolution, + depth: 1, + mip_levels: None, + samples: SampleCount::One, + format: TextureFormat::Depth32Float, + usage: TextureUsages::RENDER_ATTACHMENT | TextureUsages::TEXTURE_BINDING, + }); + + let multi_sample = samples.needs_resolve().then(|| { + graph.add_render_target(RenderTargetDescriptor { + label: Some("hdr depth multisampled".into()), + resolution, + depth: 1, + mip_levels: Some(1), + samples, + format: TextureFormat::Depth32Float, + usage: TextureUsages::RENDER_ATTACHMENT | TextureUsages::TEXTURE_BINDING, + }) + }); + + Self { + single_sample_mipped, + multi_sample, + } + } + + pub fn rendering_target(&self) -> RenderTargetHandle { + self.multi_sample.unwrap_or(self.single_sample_mipped.set_mips(0..1)) + } +} /// Starter RenderGraph. 
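uniform_prep.wgsl above runs one compute thread per object, multiplying each object's transform by the camera's view and view_proj so the vertex shaders can read a ready-made matrix per object instead of recomputing it. With @workgroup_size(64), the host side has to dispatch ceil(object_count / 64) workgroups; a sketch of that sizing (the helper name is illustrative, the real dispatch lives in the culler code and is not part of this hunk):

    // One thread per object; round the dispatch up so the last partial
    // workgroup still launches (its extra threads early-out on the
    // object_count check in the shader).
    fn uniform_prep_workgroups(object_count: u32) -> u32 {
        (object_count + 63) / 64
    }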
/// @@ -76,35 +122,58 @@ impl BaseRenderGraph { ambient: Vec4, clear_color: Vec4, ) { - // Create intermediate storage + // Create the data and handles for the graph. let state = BaseRenderGraphIntermediateState::new(graph, eval_output, resolution, samples); - // Preparing and uploading data + // Clear the shadow map. + state.clear_shadow(graph); + + // Prepare all the uniforms that all shaders need access to. state.create_frame_uniforms(graph, self, ambient, resolution); - // Skinning + // Perform compute based skinning. state.skinning(graph, self); - // Culling + // Upload the uniforms for the objects in the shadow pass. + state.shadow_object_uniform_upload(graph, self, eval_output); + // Perform culling for the objects in the shadow pass. state.pbr_shadow_culling(graph, self); - state.pbr_culling(graph, self); - // Depth-only rendering + // Render all the shadows to the shadow map. state.pbr_shadow_rendering(graph, pbr, &eval_output.shadows); - // Clear targets + // Clear the primary render target and depth target. state.clear(graph, clear_color); - // Forward rendering opaque - state.pbr_forward_rendering_opaque(graph, pbr, samples); + // Upload the uniforms for the objects in the forward pass. + state.object_uniform_upload(graph, self, resolution, samples); + + // Do the first pass, rendering the predicted triangles from last frame. + state.pbr_render_opaque_predicted_triangles(graph, pbr, samples); - // Skybox + // Create the hi-z buffer. + state.hi_z(graph, pbr, resolution); + + // Perform culling for the objects in the forward pass. + // + // The result of culling will be used to predict the visible triangles for + // the next frame. It will also render all the triangles that were visible + // but were not predicted last frame. + state.pbr_culling(graph, self); + + // Do the second pass, rendering the residual triangles. + state.pbr_render_opaque_residual_triangles(graph, pbr, samples); + + // Render the skybox. state.skybox(graph, skybox, samples); - // Forward rendering transparent + // Render all transparent objects. + // + // This _must_ happen after culling, as all transparent objects are + // considered "residual". state.pbr_forward_rendering_transparent(graph, pbr, samples); - // Make the reference to the surface + // Tonemap the HDR inner buffer to the output buffer. state.tonemapping(graph, tonemapping, target_texture); } } @@ -115,15 +184,15 @@ impl BaseRenderGraph { /// so desire. 
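The reordered add_to_graph above is the heart of the change: pass 1 redraws whatever last frame's culling predicted visible, its depth feeds the hi-z pyramid, culling then runs against that pyramid, and pass 2 draws only the residual triangles that were visible but not predicted. A small model of how a culled triangle lands in the two index streams, inferred from the write_predicted/write_residual paths in cull.wgsl (transparent and other sort-constrained objects skip prediction and always take the residual path):

    /// Which output stream(s) a triangle is written to this frame.
    #[derive(Debug, PartialEq)]
    enum OutputStream {
        Predicted,            // already drawn in pass 1, kept for next frame's prediction
        PredictedAndResidual, // missed by last frame's prediction, drawn late in pass 2
        None,                 // failed culling entirely
    }

    fn classify(passes_culling: bool, predicted_last_frame: bool) -> OutputStream {
        match (passes_culling, predicted_last_frame) {
            (true, true) => OutputStream::Predicted,
            (true, false) => OutputStream::PredictedAndResidual,
            (false, _) => OutputStream::None,
        }
    }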
pub struct BaseRenderGraphIntermediateState { pub pre_cull: DataHandle, - pub shadow_cull: Vec>, - pub cull: DataHandle, + pub shadow_cull: Vec>>, + pub cull: DataHandle>, pub shadow_uniform_bg: DataHandle, pub forward_uniform_bg: DataHandle, pub shadow: RenderTargetHandle, pub color: RenderTargetHandle, pub resolve: Option, - pub depth: RenderTargetHandle, + pub depth: DepthTargets, pub pre_skinning_buffers: DataHandle, } impl BaseRenderGraphIntermediateState { @@ -146,6 +215,7 @@ impl BaseRenderGraphIntermediateState { label: Some("shadow target".into()), resolution: eval_output.shadow_target_size, depth: 1, + mip_levels: Some(1), samples: SampleCount::One, format: INTERNAL_SHADOW_DEPTH_FORMAT, usage: TextureUsages::RENDER_ATTACHMENT | TextureUsages::TEXTURE_BINDING, @@ -157,6 +227,7 @@ impl BaseRenderGraphIntermediateState { resolution, depth: 1, samples, + mip_levels: Some(1), format: TextureFormat::Rgba16Float, usage: TextureUsages::RENDER_ATTACHMENT | TextureUsages::TEXTURE_BINDING, }); @@ -165,19 +236,13 @@ impl BaseRenderGraphIntermediateState { label: Some("hdr resolve".into()), resolution, depth: 1, + mip_levels: Some(1), samples: SampleCount::One, format: TextureFormat::Rgba16Float, usage: TextureUsages::RENDER_ATTACHMENT | TextureUsages::TEXTURE_BINDING, }) }); - let depth = graph.add_render_target(RenderTargetDescriptor { - label: Some("hdr depth".into()), - resolution, - depth: 1, - samples, - format: TextureFormat::Depth32Float, - usage: TextureUsages::RENDER_ATTACHMENT, - }); + let depth = DepthTargets::new(graph, resolution, samples); let pre_skinning_buffers = graph.add_data::(); @@ -219,6 +284,22 @@ impl BaseRenderGraphIntermediateState { resolution, ); } + pub fn shadow_object_uniform_upload<'node>( + &self, + graph: &mut RenderGraph<'node>, + base: &'node BaseRenderGraph, + eval_output: &InstructionEvaluationOutput, + ) { + for (shadow_index, shadow) in eval_output.shadows.iter().enumerate() { + base.gpu_culler.add_object_uniform_upload_to_graph::( + graph, + CameraIndex::Shadow(shadow_index as u32), + UVec2::splat(shadow.map.size), + SampleCount::One, + &format_sso!("Shadow Culling S{}", shadow_index), + ); + } + } /// Does all shadow culling for the PBR materials. pub fn pbr_shadow_culling<'node>(&self, graph: &mut RenderGraph<'node>, base: &'node BaseRenderGraph) { @@ -226,7 +307,8 @@ impl BaseRenderGraphIntermediateState { base.gpu_culler.add_culling_to_graph::( graph, shadow_culled, - Some(shadow_index), + self.shadow, + CameraIndex::Shadow(shadow_index as u32), &format_sso!("Shadow Culling S{}", shadow_index), ); } @@ -236,15 +318,48 @@ impl BaseRenderGraphIntermediateState { skinning::add_skinning_to_graph(graph, &base.gpu_skinner); } + pub fn object_uniform_upload<'node>( + &self, + graph: &mut RenderGraph<'node>, + base: &'node BaseRenderGraph, + resolution: UVec2, + samples: SampleCount, + ) { + base.gpu_culler.add_object_uniform_upload_to_graph::( + graph, + CameraIndex::Viewport, + resolution, + samples, + "Uniform Bake", + ); + } + /// Does all culling for the forward PBR materials. 
pub fn pbr_culling<'node>(&self, graph: &mut RenderGraph<'node>, base: &'node BaseRenderGraph) { - base.gpu_culler - .add_culling_to_graph::(graph, self.cull, None, "Primary Culling"); + base.gpu_culler.add_culling_to_graph::( + graph, + self.cull, + self.depth.single_sample_mipped, + CameraIndex::Viewport, + "Primary Culling", + ); + } + + /// Clear all the targets to their needed values + pub fn clear_shadow(&self, graph: &mut RenderGraph<'_>) { + crate::clear::add_clear_to_graph(graph, None, None, self.shadow, Vec4::ZERO, 0.0); } /// Clear all the targets to their needed values pub fn clear(&self, graph: &mut RenderGraph<'_>, clear_color: Vec4) { - crate::clear::add_clear_to_graph(graph, self.color, self.resolve, self.depth, clear_color, 0.0); + crate::clear::add_clear_to_graph( + graph, + Some(self.color), + self.resolve, + self.depth.rendering_target(), + clear_color, + 0.0, + ); } /// Render all shadows for the PBR materials. @@ -261,17 +376,17 @@ impl BaseRenderGraphIntermediateState { routine.add_forward_to_graph(RoutineAddToGraphArgs { graph, whole_frame_uniform_bg: self.shadow_uniform_bg, - culled: *shadow_cull, + culling_output_handle: Some(*shadow_cull), per_material: &pbr.per_material, extra_bgs: None, label: &format!("pbr shadow renderering S{shadow_index}"), samples: SampleCount::One, + camera: CameraIndex::Shadow(shadow_index as u32), color: None, resolve: None, depth: self .shadow - .restrict(0..1, ViewportRect::new(desc.map.offset, UVec2::splat(desc.map.size))), - data: shadow_index as u32, + .set_viewport(ViewportRect::new(desc.map.offset, UVec2::splat(desc.map.size))), }); } } @@ -289,7 +404,7 @@ impl BaseRenderGraphIntermediateState { graph, self.color, self.resolve, - self.depth, + self.depth.rendering_target(), self.forward_uniform_bg, samples, ); @@ -297,7 +412,7 @@ impl BaseRenderGraphIntermediateState { } /// Render the PBR materials. - pub fn pbr_forward_rendering_opaque<'node>( + pub fn pbr_render_opaque_predicted_triangles<'node>( &self, graph: &mut RenderGraph<'node>, pbr: &'node pbr::PbrRoutine, @@ -308,15 +423,40 @@ impl BaseRenderGraphIntermediateState { routine.add_forward_to_graph(RoutineAddToGraphArgs { graph, whole_frame_uniform_bg: self.forward_uniform_bg, - culled: self.cull, + culling_output_handle: None, per_material: &pbr.per_material, extra_bgs: None, - label: "PBR Forward", + label: "PBR Forward Pass 1", samples, + camera: CameraIndex::Viewport, color: Some(self.color), resolve: self.resolve, - depth: self.depth, - data: 0, + depth: self.depth.rendering_target(), + }); + } + } + + /// Render the PBR materials. 
+ pub fn pbr_render_opaque_residual_triangles<'node>( + &self, + graph: &mut RenderGraph<'node>, + pbr: &'node pbr::PbrRoutine, + samples: SampleCount, + ) { + let routines = [&pbr.opaque_routine, &pbr.cutout_routine]; + for routine in routines { + routine.add_forward_to_graph(RoutineAddToGraphArgs { + graph, + whole_frame_uniform_bg: self.forward_uniform_bg, + culling_output_handle: Some(self.cull), + per_material: &pbr.per_material, + extra_bgs: None, + label: "PBR Forward Pass 2", + samples, + camera: CameraIndex::Viewport, + color: Some(self.color), + resolve: self.resolve, + depth: self.depth.rendering_target(), }); } } @@ -331,18 +471,22 @@ impl BaseRenderGraphIntermediateState { pbr.blend_routine.add_forward_to_graph(RoutineAddToGraphArgs { graph, whole_frame_uniform_bg: self.forward_uniform_bg, - culled: self.cull, + culling_output_handle: Some(self.cull), per_material: &pbr.per_material, extra_bgs: None, label: "PBR Forward", + camera: CameraIndex::Viewport, samples, color: Some(self.color), resolve: self.resolve, - depth: self.depth, - data: 0, + depth: self.depth.rendering_target(), }); } + pub fn hi_z<'node>(&self, graph: &mut RenderGraph<'node>, pbr: &'node pbr::PbrRoutine, resolution: UVec2) { + pbr.hi_z.add_hi_z_to_graph(graph, self.depth, resolution); + } + /// Tonemap onto the given render target. pub fn tonemapping<'node>( &self, diff --git a/rend3-routine/src/clear.rs b/rend3-routine/src/clear.rs index 948cb943..576be017 100644 --- a/rend3-routine/src/clear.rs +++ b/rend3-routine/src/clear.rs @@ -9,7 +9,7 @@ use rend3::graph::{ /// it makes it a lot easier to udnerstand where the clear is coming from. pub fn add_clear_to_graph( graph: &mut RenderGraph<'_>, - color: RenderTargetHandle, + color: Option, resolve: Option, depth: RenderTargetHandle, clear_color: Vec4, @@ -17,21 +17,25 @@ pub fn add_clear_to_graph( ) { let mut builder = graph.add_node("Clear"); - let hdr_color_handle = builder.add_render_target(color, NodeResourceUsage::Output); + let hdr_color_handle = builder.add_optional_render_target(color, NodeResourceUsage::Output); let hdr_resolve = builder.add_optional_render_target(resolve, NodeResourceUsage::Output); let hdr_depth_handle = builder.add_render_target(depth, NodeResourceUsage::Output); let _rpass_handle = builder.add_renderpass(RenderPassTargets { - targets: vec![RenderPassTarget { - color: hdr_color_handle, - clear: wgpu::Color { - r: clear_color.x as f64, - g: clear_color.y as f64, - b: clear_color.z as f64, - a: clear_color.w as f64, - }, - resolve: hdr_resolve, - }], + targets: if let Some(hdr_color_handle) = hdr_color_handle { + vec![RenderPassTarget { + color: hdr_color_handle, + clear: wgpu::Color { + r: clear_color.x as f64, + g: clear_color.y as f64, + b: clear_color.z as f64, + a: clear_color.w as f64, + }, + resolve: hdr_resolve, + }] + } else { + vec![] + }, depth_stencil: Some(RenderPassDepthTarget { target: hdr_depth_handle, depth_clear: Some(depth_clear), diff --git a/rend3-routine/src/common/camera.rs b/rend3-routine/src/common/camera.rs new file mode 100644 index 00000000..6f09f5cc --- /dev/null +++ b/rend3-routine/src/common/camera.rs @@ -0,0 +1,36 @@ +/// Index representing which camera we're referring to. +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub enum CameraIndex { + Viewport, + Shadow(u32), +} + +impl CameraIndex { + /// Returns `true` if the camera index is [`Viewport`]. 
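The CameraIndex enum introduced in camera.rs above is how the routines tell the culler which camera's uniforms to use; on the GPU side the viewport camera is encoded as u32::MAX, which is the sentinel cull.wgsl checks in is_shadow_pass(). A usage sketch of the conversion, relying on to_shader_index as defined a little further down in this file:

    fn example_shader_indices() {
        assert_eq!(CameraIndex::Viewport.to_shader_index(), u32::MAX);
        assert_eq!(CameraIndex::Shadow(3).to_shader_index(), 3);
    }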
+ /// + /// [`Viewport`]: CameraIndex::Viewport + #[must_use] + pub fn is_viewport(&self) -> bool { + matches!(self, Self::Viewport) + } + + /// Returns `true` if the camera index is [`Shadow`]. + /// + /// [`Shadow`]: CameraIndex::Shadow + #[must_use] + pub fn is_shadow(&self) -> bool { + matches!(self, Self::Shadow(..)) + } + + /// Returns a shader compatible index for the camera, using u32::MAX for the viewport camera. + #[must_use] + pub fn to_shader_index(&self) -> u32 { + match *self { + Self::Viewport => u32::MAX, + Self::Shadow(index) => { + assert_ne!(index, u32::MAX, "Shadow camera index cannot be 0xFFFF_FFFF"); + index + } + } + } +} diff --git a/rend3-routine/src/common/interfaces.rs b/rend3-routine/src/common/interfaces.rs index 69ca73b1..cef01b7e 100644 --- a/rend3-routine/src/common/interfaces.rs +++ b/rend3-routine/src/common/interfaces.rs @@ -123,6 +123,15 @@ impl PerMaterialArchetypeInterface { }, None, ) + .append( + ShaderStages::VERTEX_FRAGMENT, + BindingType::Buffer { + ty: BufferBindingType::Storage { read_only: true }, + has_dynamic_offset: false, + min_binding_size: None, + }, + None, + ) .build(device, Some("per material bgl")); Self { diff --git a/rend3-routine/src/common/mod.rs b/rend3-routine/src/common/mod.rs index f8a2bed8..6acc7257 100644 --- a/rend3-routine/src/common/mod.rs +++ b/rend3-routine/src/common/mod.rs @@ -1,7 +1,9 @@ //! Common utilities used throughout the crate. +mod camera; mod interfaces; mod samplers; +pub use camera::*; pub use interfaces::*; pub use samplers::*; diff --git a/rend3-routine/src/culling/batching.rs b/rend3-routine/src/culling/batching.rs index 84eb924f..cff60c9d 100644 --- a/rend3-routine/src/culling/batching.rs +++ b/rend3-routine/src/culling/batching.rs @@ -1,13 +1,16 @@ -use std::cmp::Ordering; +use std::{cmp::Ordering, collections::HashMap}; use encase::ShaderType; use ordered_float::OrderedFloat; use rend3::{ - managers::{CameraManager, MaterialManager, ObjectManager, TextureBindGroupIndex}, - types::{Material, SortingOrder, SortingReason}, - util::math::round_up, + graph::NodeExecutionContext, + managers::{CameraManager, TextureBindGroupIndex}, + types::{GraphDataHandle, Material, RawObjectHandle, SortingOrder, SortingReason}, + util::{math::round_up, typedefs::FastHashMap}, }; +use crate::common::CameraIndex; + use super::{BATCH_SIZE, WORKGROUP_SIZE}; #[derive(Debug)] @@ -19,8 +22,6 @@ pub struct ShaderBatchDatas { #[derive(Debug)] pub(super) struct JobSubRegion { pub job_index: u32, - pub base_invocation: u32, - pub invocation_count: u32, pub key: ShaderJobKey, } @@ -80,48 +81,74 @@ impl Ord for ShaderJobSortingKey { #[derive(Debug, ShaderType)] pub struct ShaderBatchData { #[align(256)] - pub(super) ranges: [ShaderObjectRange; BATCH_SIZE], pub(super) total_objects: u32, pub(super) total_invocations: u32, - pub(super) base_output_invocation: u32, + pub(super) batch_base_invocation: u32, + pub(super) object_culling_information: [ShaderObjectCullingInformation; BATCH_SIZE], } #[derive(Debug, Copy, Clone, Default, ShaderType)] -pub(super) struct ShaderObjectRange { +pub(super) struct ShaderObjectCullingInformation { pub invocation_start: u32, pub invocation_end: u32, pub object_id: u32, + pub region_id: u32, + pub base_region_invocation: u32, + pub local_region_id: u32, + pub previous_global_invocation: u32, + pub atomic_capable: u32, +} + +/// Map containing the previous invocation of each object. 
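In the batching loop that follows, each object's triangle range is appended at the current invocation cursor and the cursor then advances by the triangle count rounded up to the culling workgroup size, so a workgroup never straddles two objects and find_object_info in cull.wgsl can resolve the whole workgroup's object from its first invocation alone. A worked sketch of that bookkeeping, assuming WORKGROUP_SIZE is 64 to match @workgroup_size(64) in the shader (advance_cursor is an illustrative helper):

    // An object with 100 triangles placed at invocation 0 owns 0..100; the
    // cursor then jumps to 128, where the next object starts.
    fn advance_cursor(cursor: u32, triangle_count: u32) -> u32 {
        cursor + ((triangle_count + 63) / 64) * 64
    }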
+pub struct PerCameraPreviousInvocationsMap { + inner: FastHashMap>, +} +impl PerCameraPreviousInvocationsMap { + pub fn new() -> Self { + Self { + inner: HashMap::default(), + } + } + + pub fn get_and_reset_camera(&mut self, camera: CameraIndex) -> FastHashMap { + self.inner.remove(&camera).unwrap_or_default() + } + + pub fn set_camera(&mut self, camera: CameraIndex, previous_invocations: FastHashMap) { + self.inner.insert(camera, previous_invocations); + } } pub(super) fn batch_objects( - material_manager: &MaterialManager, - object_manager: &ObjectManager, - camera_manager: &CameraManager, - max_dispatch: u32, + ctx: &mut NodeExecutionContext, + previous_invocation_map_handle: &GraphDataHandle, + camera: &CameraManager, + camera_idx: CameraIndex, ) -> ShaderBatchDatas { profiling::scope!("Batch Objects"); + let mut per_camera_previous_invocation_map = ctx.data_core.graph_storage.get_mut(previous_invocation_map_handle); + let previous_invocation_map = per_camera_previous_invocation_map.get_and_reset_camera(camera_idx); + let mut current_invocation_map = FastHashMap::default(); + let mut jobs = ShaderBatchDatas { jobs: Vec::new(), regions: Vec::new(), }; - let objects = match object_manager.enumerated_objects::() { + let objects = match ctx.data_core.object_manager.enumerated_objects::() { Some(o) => o, None => return jobs, }; - let material_archetype = material_manager.archetype_view::(); + let material_archetype = ctx.data_core.material_manager.archetype_view::(); let mut sorted_objects = Vec::with_capacity(objects.len()); { profiling::scope!("Sort Key Creation"); for (handle, object) in objects { // Frustum culling - if !camera_manager - .world_frustum() - .contains_sphere(object.inner.bounding_sphere) - { + if !camera.world_frustum().contains_sphere(object.inner.bounding_sphere) { continue; } @@ -134,7 +161,11 @@ pub(super) fn batch_objects( let material_key = material.inner.key(); let sorting = material.inner.sorting(); - let mut distance_sq = camera_manager.location().distance_squared(object.location.into()); + let mut distance_sq = ctx + .data_core + .camera_manager + .location() + .distance_squared(object.location.into()); if sorting.order == SortingOrder::BackToFront { distance_sq = -distance_sq; } @@ -160,36 +191,49 @@ pub(super) fn batch_objects( if !sorted_objects.is_empty() { profiling::scope!("Batch Data Creation"); + let mut current_region_idx = 0_u32; + let mut current_region_object_index = 0_u32; let mut current_base_invocation = 0_u32; let mut current_region_invocation = 0_u32; let mut current_invocation = 0_u32; let mut current_object_index = 0_u32; - let mut current_ranges = [ShaderObjectRange::default(); BATCH_SIZE]; + let mut current_ranges = [ShaderObjectCullingInformation::default(); BATCH_SIZE]; let mut current_key = sorted_objects.first().unwrap().0.job_key; - for (ShaderJobSortingKey { job_key: key, .. }, handle, object) in sorted_objects { + let max_dispatch_count = ctx.renderer.limits.max_compute_workgroups_per_dimension; + + for ( + ShaderJobSortingKey { + job_key: key, + sorting_reason, + .. 
+ }, + handle, + object, + ) in sorted_objects + { let invocation_count = object.inner.index_count / 3; let key_difference = key != current_key; let object_limit = current_object_index == 256; - let dispatch_limit = (current_invocation + invocation_count) >= max_dispatch * WORKGROUP_SIZE; + let dispatch_limit = (current_invocation + invocation_count) >= max_dispatch_count * WORKGROUP_SIZE; if key_difference || object_limit || dispatch_limit { jobs.regions.push(JobSubRegion { job_index: jobs.jobs.len() as u32, - base_invocation: current_region_invocation, - invocation_count: current_invocation - current_region_invocation, key: current_key, }); + current_region_idx += 1; current_key = key; + current_region_object_index = 0; current_region_invocation = current_invocation; } if object_limit || dispatch_limit { jobs.jobs.push(ShaderBatchData { - ranges: current_ranges, + object_culling_information: current_ranges, total_objects: current_object_index, total_invocations: current_invocation, - base_output_invocation: current_base_invocation, + batch_base_invocation: current_base_invocation, }); current_base_invocation += current_invocation; @@ -198,30 +242,38 @@ pub(super) fn batch_objects( current_object_index = 0; } - let range = ShaderObjectRange { + let range = ShaderObjectCullingInformation { invocation_start: current_invocation, invocation_end: current_invocation + invocation_count, + region_id: current_region_idx, object_id: handle.idx as u32, + base_region_invocation: current_region_invocation, + local_region_id: current_region_object_index, + previous_global_invocation: previous_invocation_map.get(&handle).copied().unwrap_or(u32::MAX), + atomic_capable: matches!(sorting_reason, SortingReason::Optimization) as u32, }; + current_invocation_map.insert(handle, current_invocation + current_base_invocation); + current_ranges[current_object_index as usize] = range; current_object_index += 1; + current_region_object_index += 1; current_invocation += round_up(invocation_count, WORKGROUP_SIZE); } jobs.regions.push(JobSubRegion { job_index: jobs.jobs.len() as u32, - base_invocation: current_region_invocation, - invocation_count: current_invocation - current_region_invocation, key: current_key, }); jobs.jobs.push(ShaderBatchData { - ranges: current_ranges, + object_culling_information: current_ranges, total_objects: current_object_index, total_invocations: current_invocation, - base_output_invocation: current_base_invocation, + batch_base_invocation: current_base_invocation, }); } + per_camera_previous_invocation_map.set_camera(camera_idx, current_invocation_map); + jobs } diff --git a/rend3-routine/src/culling/culler.rs b/rend3-routine/src/culling/culler.rs index 68e0f8ab..bf4dcd7c 100644 --- a/rend3-routine/src/culling/culler.rs +++ b/rend3-routine/src/culling/culler.rs @@ -8,110 +8,191 @@ use std::{ }; use encase::{ShaderSize, ShaderType, StorageBuffer}; +use glam::{Mat4, UVec2, Vec2}; use rend3::{ format_sso, - graph::{DataHandle, NodeExecutionContext, NodeResourceUsage, RenderGraph}, - managers::{ShaderObject, TextureBindGroupIndex}, - types::{GraphDataHandle, Material}, - util::{ - math::{round_up, round_up_div}, - typedefs::FastHashMap, - }, + graph::{DataHandle, DeclaredDependency, NodeExecutionContext, NodeResourceUsage, RenderGraph, RenderTargetHandle}, + managers::{CameraManager, ShaderObject, TextureBindGroupIndex}, + types::{GraphDataHandle, Material, MaterialArray, SampleCount, VERTEX_ATTRIBUTE_POSITION}, + util::{frustum::Frustum, math::IntegerExt, typedefs::FastHashMap}, Renderer, 
ShaderPreProcessor, ShaderVertexBufferConfig, }; use wgpu::{ - self, BindGroupDescriptor, BindGroupEntry, BindGroupLayout, BindGroupLayoutDescriptor, BindGroupLayoutEntry, - BindingType, Buffer, BufferBinding, BufferBindingType, BufferDescriptor, BufferUsages, ComputePassDescriptor, - ComputePipeline, ComputePipelineDescriptor, Device, PipelineLayoutDescriptor, ShaderModuleDescriptor, ShaderStages, + self, AddressMode, BindGroupDescriptor, BindGroupEntry, BindGroupLayout, BindGroupLayoutDescriptor, + BindGroupLayoutEntry, BindingResource, BindingType, Buffer, BufferBinding, BufferBindingType, BufferDescriptor, + BufferUsages, CommandEncoder, ComputePassDescriptor, ComputePipeline, ComputePipelineDescriptor, Device, + FilterMode, PipelineLayoutDescriptor, Queue, Sampler, SamplerBindingType, SamplerDescriptor, + ShaderModuleDescriptor, ShaderStages, TextureSampleType, TextureViewDimension, }; -use crate::culling::{ - batching::{batch_objects, JobSubRegion, ShaderBatchData, ShaderBatchDatas}, - WORKGROUP_SIZE, +use crate::{ + common::CameraIndex, + culling::{ + batching::{batch_objects, JobSubRegion, PerCameraPreviousInvocationsMap, ShaderBatchData, ShaderBatchDatas}, + suballoc::InputOutputBuffer, + WORKGROUP_SIZE, + }, }; -// 16 MB of indices -const OUTPUT_BUFFER_ROUNDING_SIZE: u64 = 1 << 24; -// At least 64 batches -const BATCH_DATA_ROUNDING_SIZE: u64 = ShaderBatchData::SHADER_SIZE.get() * 64; - #[derive(Debug)] pub struct DrawCallSet { - pub buffers: CullingBuffers>, + pub culling_data_buffer: Buffer, + pub per_camera_uniform: Arc, pub draw_calls: Vec, /// Range of draw calls in the draw call array corresponding to a given material key. pub material_key_ranges: HashMap>, } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct DrawCall { pub bind_group_index: TextureBindGroupIndex, - pub index_range: Range, pub batch_index: u32, } #[derive(Default)] -struct CullingBufferMap { - inner: FastHashMap, CullingBuffers>>, +pub struct CullingBufferMap { + inner: FastHashMap, } impl CullingBufferMap { - fn get_buffers( + pub fn get_buffers(&self, camera: CameraIndex) -> Option<&CullingBuffers> { + self.inner.get(&camera) + } + + fn get_or_resize_buffers( &mut self, + queue: &Queue, device: &Device, - camera: Option, - mut sizes: CullingBuffers, - ) -> &CullingBuffers> { - sizes.object_reference = round_up(sizes.object_reference.max(1), BATCH_DATA_ROUNDING_SIZE); - sizes.index = round_up(sizes.index.max(1), OUTPUT_BUFFER_ROUNDING_SIZE); - + encoder: &mut CommandEncoder, + camera: CameraIndex, + sizes: CullingBufferSizes, + ) -> &mut CullingBuffers { match self.inner.entry(camera) { Entry::Occupied(b) => { let b = b.into_mut(); - let current_size = CullingBuffers { - object_reference: b.object_reference.size(), - index: b.index.size(), - }; - if current_size != sizes { - *b = CullingBuffers::new(device, sizes); - } + b.update_sizes(queue, device, encoder, sizes); + b } - Entry::Vacant(b) => b.insert(CullingBuffers::new(device, sizes)), + Entry::Vacant(b) => b.insert(CullingBuffers::new(device, queue, sizes)), } } } -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct CullingBuffers { - pub object_reference: T, - pub index: T, +struct CullingBufferSizes { + invocations: u64, + draw_calls: u64, } -impl CullingBuffers> { - pub fn new(device: &Device, sizes: CullingBuffers) -> Self { - CullingBuffers { - object_reference: Arc::new(device.create_buffer(&BufferDescriptor { - label: None, - size: sizes.object_reference, - usage: BufferUsages::STORAGE | BufferUsages::COPY_DST, - mapped_at_creation: 
false, - })), - index: Arc::new(device.create_buffer(&BufferDescriptor { - label: None, - size: sizes.index, - usage: BufferUsages::STORAGE | BufferUsages::INDEX, - mapped_at_creation: false, - })), +#[derive(Debug)] +pub struct CullingBuffers { + pub index_buffer: InputOutputBuffer, + pub draw_call_buffer: InputOutputBuffer, + pub culling_results_buffer: InputOutputBuffer, +} + +impl CullingBuffers { + fn new(device: &Device, queue: &Queue, sizes: CullingBufferSizes) -> Self { + Self { + // One element per triangle/invocation + index_buffer: InputOutputBuffer::new(device, queue, sizes.invocations, "Index Buffer", 4, 4, false), + draw_call_buffer: InputOutputBuffer::new(device, queue, sizes.draw_calls, "Draw Call Buffer", 20, 4, true), + culling_results_buffer: InputOutputBuffer::new( + device, + queue, + // 32 bits in a u32 + sizes.invocations.div_round_up(u32::BITS as _), + "Culling Results Buffer", + 4, + 4, + false, + ), + } + } + + fn update_sizes( + &mut self, + queue: &Queue, + device: &Device, + encoder: &mut CommandEncoder, + sizes: CullingBufferSizes, + ) { + self.index_buffer.swap(queue, device, encoder, sizes.invocations * 3); + self.draw_call_buffer.swap(queue, device, encoder, sizes.draw_calls); + self.culling_results_buffer + .swap(queue, device, encoder, sizes.invocations.div_round_up(32)); + } +} + +#[derive(Debug, Copy, Clone)] +pub enum TriangleVisibility { + PositiveAreaVisible, + NegativeAreaVisible, +} + +impl TriangleVisibility { + fn from_winding_and_face(winding: wgpu::FrontFace, culling: wgpu::Face) -> Self { + match (winding, culling) { + (wgpu::FrontFace::Ccw, wgpu::Face::Back) => TriangleVisibility::PositiveAreaVisible, + (wgpu::FrontFace::Ccw, wgpu::Face::Front) => TriangleVisibility::NegativeAreaVisible, + (wgpu::FrontFace::Cw, wgpu::Face::Back) => TriangleVisibility::NegativeAreaVisible, + (wgpu::FrontFace::Cw, wgpu::Face::Front) => TriangleVisibility::PositiveAreaVisible, } } + + fn is_positive(self) -> bool { + match self { + TriangleVisibility::PositiveAreaVisible => true, + TriangleVisibility::NegativeAreaVisible => false, + } + } +} + +bitflags::bitflags! { + struct PerCameraUniformFlags: u32 { + const POSTIIVE_AREA_VISIBLE = 1 << 0; + const MULTISAMPLED = 1 << 1; + } +} + +#[derive(ShaderType)] +struct PerCameraUniform { + // TODO: use less space + view: Mat4, + // TODO: use less space + view_proj: Mat4, + // The index of which shadow caster we are rendering for. + // + // This will be u32::MAX if we're rendering for a camera, not a shadow map. 
+ shadow_index: u32, + frustum: Frustum, + resolution: Vec2, + // Created from PerCameraUniformFlags + flags: u32, + object_count: u32, + #[size(runtime)] + objects: Vec, +} + +#[derive(ShaderType)] +struct PerCameraUniformObjectData { + // TODO: use less space + model_view: Mat4, + // TODO: use less space + model_view_proj: Mat4, } pub struct GpuCuller { - bgl: BindGroupLayout, - pipeline: ComputePipeline, + prep_bgl: BindGroupLayout, + prep_pipeline: ComputePipeline, + culling_bgl: BindGroupLayout, + culling_pipeline: ComputePipeline, + sampler: Sampler, + winding: wgpu::FrontFace, type_id: TypeId, - culling_buffer_map_handle: GraphDataHandle, + per_material_buffer_handle: GraphDataHandle>>, + pub culling_buffer_map_handle: GraphDataHandle, + previous_invocation_map_handle: GraphDataHandle, } impl GpuCuller { @@ -121,22 +202,85 @@ impl GpuCuller { { let type_name = type_name::(); - let source = spp + let prep_source = spp .render_shader( - "rend3-routine/cull.wgsl", + "rend3-routine/uniform_prep.wgsl", &(), Some(&ShaderVertexBufferConfig::from_material::()), ) .unwrap(); - let sm = renderer.device.create_shader_module(ShaderModuleDescriptor { + let prep_sm = renderer.device.create_shader_module(ShaderModuleDescriptor { + label: Some(&format_sso!("UniformPrep {type_name} SM")), + source: wgpu::ShaderSource::Wgsl(Cow::Owned(prep_source)), + }); + + let prep_bgl = renderer.device.create_bind_group_layout(&BindGroupLayoutDescriptor { + label: Some(&format_sso!("UniformPrep {type_name} BGL")), + entries: &[ + // Object + BindGroupLayoutEntry { + binding: 0, + visibility: ShaderStages::COMPUTE, + ty: BindingType::Buffer { + ty: BufferBindingType::Storage { read_only: true }, + has_dynamic_offset: false, + min_binding_size: Some(ShaderObject::::SHADER_SIZE), + }, + count: None, + }, + // Object + BindGroupLayoutEntry { + binding: 1, + visibility: ShaderStages::COMPUTE, + ty: BindingType::Buffer { + ty: BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: Some(PerCameraUniform::min_size()), + }, + count: None, + }, + ], + }); + + let prep_pll = renderer.device.create_pipeline_layout(&PipelineLayoutDescriptor { + label: Some(&format_sso!("UniformPrep {type_name} PLL")), + bind_group_layouts: &[&prep_bgl], + push_constant_ranges: &[], + }); + + let prep_pipeline = renderer.device.create_compute_pipeline(&ComputePipelineDescriptor { + label: Some(&format_sso!("UniformPrep {type_name} PLL")), + layout: Some(&prep_pll), + module: &prep_sm, + entry_point: "cs_main", + }); + + let position_offset = M::supported_attributes() + .into_iter() + .enumerate() + .find_map(|(idx, a)| (*a == *VERTEX_ATTRIBUTE_POSITION).then_some(idx)) + .unwrap(); + + let culling_source = spp + .render_shader( + "rend3-routine/cull.wgsl", + &serde_json::json! 
{{ + "position_attribute_offset": position_offset, + }}, + Some(&ShaderVertexBufferConfig::from_material::()), + ) + .unwrap(); + + let culling_sm = renderer.device.create_shader_module(ShaderModuleDescriptor { label: Some(&format_sso!("GpuCuller {type_name} SM")), - source: wgpu::ShaderSource::Wgsl(Cow::Owned(source)), + source: wgpu::ShaderSource::Wgsl(Cow::Owned(culling_source)), }); - let bgl = renderer.device.create_bind_group_layout(&BindGroupLayoutDescriptor { + let culling_bgl = renderer.device.create_bind_group_layout(&BindGroupLayoutDescriptor { label: Some(&format_sso!("GpuCuller {type_name} BGL")), entries: &[ + // Vertex Buffer BindGroupLayoutEntry { binding: 0, visibility: ShaderStages::COMPUTE, @@ -147,6 +291,7 @@ impl GpuCuller { }, count: None, }, + // Object Buffer BindGroupLayoutEntry { binding: 1, visibility: ShaderStages::COMPUTE, @@ -157,6 +302,7 @@ impl GpuCuller { }, count: None, }, + // Batch data BindGroupLayoutEntry { binding: 2, visibility: ShaderStages::COMPUTE, @@ -167,43 +313,243 @@ impl GpuCuller { }, count: None, }, + // Draw Calls BindGroupLayoutEntry { binding: 3, visibility: ShaderStages::COMPUTE, ty: BindingType::Buffer { ty: BufferBindingType::Storage { read_only: false }, has_dynamic_offset: false, - min_binding_size: Some(NonZeroU64::new(4).unwrap()), + min_binding_size: Some(NonZeroU64::new(20 + 8).unwrap()), + }, + count: None, + }, + // Index buffer + BindGroupLayoutEntry { + binding: 4, + visibility: ShaderStages::COMPUTE, + ty: BindingType::Buffer { + ty: BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: Some(NonZeroU64::new(4 + 8).unwrap()), + }, + count: None, + }, + // Culling Results + BindGroupLayoutEntry { + binding: 5, + visibility: ShaderStages::COMPUTE, + ty: BindingType::Buffer { + ty: BufferBindingType::Storage { read_only: false }, + has_dynamic_offset: false, + min_binding_size: Some(NonZeroU64::new(4 + 8).unwrap()), }, count: None, }, + // per camera uniforms + BindGroupLayoutEntry { + binding: 6, + visibility: ShaderStages::COMPUTE, + ty: BindingType::Buffer { + ty: BufferBindingType::Storage { read_only: true }, + has_dynamic_offset: false, + min_binding_size: Some(PerCameraUniform::min_size()), + }, + count: None, + }, + // hirearchical z buffer + BindGroupLayoutEntry { + binding: 7, + visibility: ShaderStages::COMPUTE, + ty: BindingType::Texture { + sample_type: TextureSampleType::Depth, + view_dimension: TextureViewDimension::D2, + multisampled: false, + }, + count: None, + }, + // hirearchical z buffer + BindGroupLayoutEntry { + binding: 8, + visibility: ShaderStages::COMPUTE, + ty: BindingType::Sampler(SamplerBindingType::NonFiltering), + count: None, + }, ], }); - let pll = renderer.device.create_pipeline_layout(&PipelineLayoutDescriptor { + let culling_pll = renderer.device.create_pipeline_layout(&PipelineLayoutDescriptor { label: Some(&format_sso!("GpuCuller {type_name} PLL")), - bind_group_layouts: &[&bgl], + bind_group_layouts: &[&culling_bgl], push_constant_ranges: &[], }); - let pipeline = renderer.device.create_compute_pipeline(&ComputePipelineDescriptor { + let culling_pipeline = renderer.device.create_compute_pipeline(&ComputePipelineDescriptor { label: Some(&format_sso!("GpuCuller {type_name} PLL")), - layout: Some(&pll), - module: &sm, + layout: Some(&culling_pll), + module: &culling_sm, entry_point: "cs_main", }); + let sampler = renderer.device.create_sampler(&SamplerDescriptor { + label: Some("HiZ Sampler"), + address_mode_u: AddressMode::ClampToEdge, + 
address_mode_v: AddressMode::ClampToEdge, + address_mode_w: AddressMode::ClampToEdge, + mag_filter: FilterMode::Nearest, + min_filter: FilterMode::Nearest, + mipmap_filter: FilterMode::Nearest, + lod_min_clamp: 0.0, + lod_max_clamp: 100.0, + compare: None, + anisotropy_clamp: 1, + border_color: None, + }); + + let per_material_buffer_handle = renderer.add_graph_data(HashMap::default()); let culling_buffer_map_handle = renderer.add_graph_data(CullingBufferMap::default()); + let previous_invocation_map_handle = renderer.add_graph_data(PerCameraPreviousInvocationsMap::new()); Self { - bgl, - pipeline, + prep_bgl, + prep_pipeline, + culling_bgl, + culling_pipeline, + sampler, + winding: renderer.handedness.into(), type_id: TypeId::of::(), + per_material_buffer_handle, culling_buffer_map_handle, + previous_invocation_map_handle, } } - pub fn cull(&self, ctx: &mut NodeExecutionContext, jobs: ShaderBatchDatas, camera: Option) -> DrawCallSet + pub fn object_uniform_upload( + &self, + ctx: &mut NodeExecutionContext, + camera: &CameraManager, + camera_idx: CameraIndex, + resolution: UVec2, + samples: SampleCount, + ) where + M: Material, + { + profiling::scope!("GpuCuller::object_uniform_upload"); + + assert_eq!(TypeId::of::(), self.type_id); + + let type_name = type_name::(); + + let encoder = ctx.encoder_or_pass.take_encoder(); + + // TODO: Isolate all this into a struct + let max_object_count = ctx + .data_core + .object_manager + .buffer::() + .map(wgpu::Buffer::size) + .unwrap_or(0) + / ShaderObject::::SHADER_SIZE.get(); + + if max_object_count == 0 { + return; + } + + let per_map_buffer_size = ((max_object_count - 1) * PerCameraUniformObjectData::SHADER_SIZE.get()) + + PerCameraUniform::min_size().get(); + + let mut per_mat_buffer_map = ctx.data_core.graph_storage.get_mut(&self.per_material_buffer_handle); + + let new_per_mat_buffer = || { + Arc::new(ctx.renderer.device.create_buffer(&BufferDescriptor { + label: None, + size: per_map_buffer_size, + usage: BufferUsages::STORAGE | BufferUsages::COPY_SRC | BufferUsages::COPY_DST, + mapped_at_creation: false, + })) + }; + let buffer = match per_mat_buffer_map.entry(camera_idx) { + Entry::Occupied(o) => { + let r = o.into_mut(); + if r.size() != per_map_buffer_size { + *r = new_per_mat_buffer(); + } + r + } + Entry::Vacant(o) => o.insert(new_per_mat_buffer()), + }; + + let culling = match camera_idx { + CameraIndex::Shadow(_) => wgpu::Face::Front, + CameraIndex::Viewport => wgpu::Face::Back, + }; + + { + // We don't write anything in the objects right now, as this will be filled in by the preparation compute shader + profiling::scope!("PerCameraUniform Data Upload"); + let per_camera_data = PerCameraUniform { + view: camera.view(), + view_proj: camera.view_proj(), + shadow_index: camera_idx.to_shader_index(), + frustum: camera.world_frustum(), + resolution: resolution.as_vec2(), + flags: { + let mut flags = PerCameraUniformFlags::empty(); + flags.set( + PerCameraUniformFlags::POSTIIVE_AREA_VISIBLE, + TriangleVisibility::from_winding_and_face(self.winding, culling).is_positive(), + ); + flags.set(PerCameraUniformFlags::MULTISAMPLED, samples != SampleCount::One); + flags.bits() + }, + object_count: max_object_count as u32, + objects: Vec::new(), + }; + let mut buffer = ctx + .renderer + .queue + .write_buffer_with(buffer, 0, per_camera_data.size()) + .unwrap(); + StorageBuffer::new(&mut *buffer).write(&per_camera_data).unwrap(); + } + + let Some(object_manager_buffer) = ctx.data_core.object_manager.buffer::() else { + return; + }; + let prep_bg = 
ctx.renderer.device.create_bind_group(&BindGroupDescriptor { + label: Some(&format_sso!("UniformPrep {type_name} BG")), + layout: &self.prep_bgl, + entries: &[ + BindGroupEntry { + binding: 0, + resource: object_manager_buffer.as_entire_binding(), + }, + BindGroupEntry { + binding: 1, + resource: buffer.as_entire_binding(), + }, + ], + }); + + profiling::scope!("Command Encoding"); + + let mut cpass = encoder.begin_compute_pass(&ComputePassDescriptor { + label: Some(&format_sso!("GpuCuller {type_name} uniform bake")), + timestamp_writes: None, + }); + cpass.set_pipeline(&self.prep_pipeline); + cpass.set_bind_group(0, &prep_bg, &[]); + cpass.dispatch_workgroups((max_object_count as u32).div_round_up(WORKGROUP_SIZE), 1, 1); + drop(cpass); + } + + pub fn cull( + &self, + ctx: &mut NodeExecutionContext, + jobs: ShaderBatchDatas, + depth_handle: DeclaredDependency, + camera_idx: CameraIndex, + ) -> DrawCallSet where M: Material, { @@ -217,38 +563,56 @@ impl GpuCuller { .jobs .iter() .map(|j: &ShaderBatchData| { - debug_assert_eq!(j.total_invocations % 256, 0); + debug_assert_eq!(j.total_invocations % WORKGROUP_SIZE, 0); j.total_invocations }) .sum(); - let buffers = ctx - .data_core - .graph_storage - .get_mut(&self.culling_buffer_map_handle) - .get_buffers( - &ctx.renderer.device, - camera, - CullingBuffers { - object_reference: jobs.jobs.size().get(), - index: ::max(total_invocations as u64 * 3 * 4, 4), - }, - ) - .clone(); + let encoder = ctx.encoder_or_pass.take_encoder(); + + let mut culling_buffer_map = ctx.data_core.graph_storage.get_mut(&self.culling_buffer_map_handle); + let buffers = culling_buffer_map.get_or_resize_buffers( + &ctx.renderer.queue, + &ctx.renderer.device, + encoder, + camera_idx, + CullingBufferSizes { + invocations: total_invocations as u64, + draw_calls: jobs.regions.len() as u64, + }, + ); + + let per_camera_uniform = Arc::clone( + ctx.data_core + .graph_storage + .get_mut(&self.per_material_buffer_handle) + .get(&camera_idx) + .unwrap_or_else(|| panic!("No per camera uniform for camera {:?}", camera_idx)), + ); + + let culling_data_buffer = { + profiling::scope!("Culling Job Data Upload"); + + let culling_data_buffer = ctx.renderer.device.create_buffer(&wgpu::BufferDescriptor { + label: Some("Culling Data Buffer"), + size: jobs.jobs.size().get(), + usage: wgpu::BufferUsages::STORAGE, + mapped_at_creation: true, + }); - { - profiling::scope!("Culling Job Data Format"); - let mut buffer = ctx - .renderer - .queue - .write_buffer_with(&buffers.object_reference, 0, jobs.jobs.size()) - .unwrap(); - StorageBuffer::new(&mut *buffer).write(&jobs.jobs).unwrap(); - } + let mut mapping = culling_data_buffer.slice(..).get_mapped_range_mut(); + StorageBuffer::new(&mut *mapping).write(&jobs.jobs).unwrap(); + drop(mapping); + culling_data_buffer.unmap(); - let bg = ctx.renderer.device.create_bind_group(&BindGroupDescriptor { + culling_data_buffer + }; + + let hi_z_buffer = ctx.graph_data.get_render_target(depth_handle); + + let culling_bg = ctx.renderer.device.create_bind_group(&BindGroupDescriptor { label: Some(&format_sso!("GpuCuller {type_name} BG")), - layout: &self.bgl, + layout: &self.culling_bgl, entries: &[ BindGroupEntry { binding: 0, @@ -260,15 +624,35 @@ impl GpuCuller { }, BindGroupEntry { binding: 2, - resource: wgpu::BindingResource::Buffer(BufferBinding { - buffer: &buffers.object_reference, + resource: BindingResource::Buffer(BufferBinding { + buffer: &culling_data_buffer, offset: 0, size: Some(ShaderBatchData::SHADER_SIZE), }), }, BindGroupEntry { binding: 3, - 
resource: buffers.index.as_entire_binding(), + resource: buffers.draw_call_buffer.as_entire_binding(), + }, + BindGroupEntry { + binding: 4, + resource: buffers.index_buffer.as_entire_binding(), + }, + BindGroupEntry { + binding: 5, + resource: buffers.culling_results_buffer.as_entire_binding(), + }, + BindGroupEntry { + binding: 6, + resource: per_camera_uniform.as_entire_binding(), + }, + BindGroupEntry { + binding: 7, + resource: BindingResource::TextureView(hi_z_buffer), + }, + BindGroupEntry { + binding: 8, + resource: BindingResource::Sampler(&self.sampler), }, ], }); @@ -289,11 +673,7 @@ impl GpuCuller { current_material_key_range_start = range_end; } - let job = &jobs.jobs[region.job_index as usize]; - let start = (job.base_output_invocation + region.base_invocation) * 3; - let end = start + (region.invocation_count * 3); draw_calls.push(DrawCall { - index_range: start..end, bind_group_index: region.key.bind_group_index, batch_index: region.job_index, }); @@ -301,57 +681,82 @@ impl GpuCuller { material_key_ranges.insert(current_material_key, current_material_key_range_start..draw_calls.len()); - let mut cpass = ctx - .encoder_or_pass - .take_encoder() - .begin_compute_pass(&ComputePassDescriptor { - label: Some(&format_sso!("GpuCuller {type_name} Culling")), - timestamp_writes: None, - }); - cpass.set_pipeline(&self.pipeline); + encoder.clear_buffer(&buffers.draw_call_buffer, 8, None); + let mut cpass = encoder.begin_compute_pass(&ComputePassDescriptor { + label: Some(&format_sso!("GpuCuller {type_name} Culling")), + timestamp_writes: None, + }); + + cpass.set_pipeline(&self.culling_pipeline); for (idx, job) in jobs.jobs.iter().enumerate() { - cpass.set_bind_group(0, &bg, &[idx as u32 * ShaderBatchData::SHADER_SIZE.get() as u32]); - cpass.dispatch_workgroups(round_up_div(job.total_invocations, WORKGROUP_SIZE), 1, 1); + // RA can't infer this + let job: &ShaderBatchData = job; + + cpass.set_bind_group( + 0, + &culling_bg, + &[idx as u32 * ShaderBatchData::SHADER_SIZE.get() as u32], + ); + cpass.dispatch_workgroups(job.total_invocations.div_round_up(WORKGROUP_SIZE), 1, 1); } drop(cpass); DrawCallSet { - buffers, + culling_data_buffer, + per_camera_uniform, draw_calls, material_key_ranges, } } + pub fn add_object_uniform_upload_to_graph<'node, M: Material>( + &'node self, + graph: &mut RenderGraph<'node>, + camera_idx: CameraIndex, + resolution: UVec2, + samples: SampleCount, + name: &str, + ) { + let mut node = graph.add_node(name); + node.add_side_effect(); + + node.build(move |mut ctx| { + let camera = match camera_idx { + CameraIndex::Shadow(i) => &ctx.eval_output.shadows[i as usize].camera, + CameraIndex::Viewport => &ctx.data_core.camera_manager, + }; + + self.object_uniform_upload::(&mut ctx, camera, camera_idx, resolution, samples); + }); + } + pub fn add_culling_to_graph<'node, M: Material>( &'node self, graph: &mut RenderGraph<'node>, - draw_calls_hdl: DataHandle, - camera: Option, + draw_calls_hdl: DataHandle>, + depth_handle: RenderTargetHandle, + camera_idx: CameraIndex, name: &str, ) { let mut node = graph.add_node(name); let output = node.add_data(draw_calls_hdl, NodeResourceUsage::Output); + let depth_handle = node.add_render_target(depth_handle, NodeResourceUsage::Input); node.build(move |mut ctx| { - let camera_manager = match camera { - Some(i) => &ctx.eval_output.shadows[i].camera, - None => &ctx.data_core.camera_manager, + let camera = match camera_idx { + CameraIndex::Shadow(i) => &ctx.eval_output.shadows[i as usize].camera, + CameraIndex::Viewport => 
&ctx.data_core.camera_manager,
             };
-            let jobs = batch_objects::<M>(
-                &ctx.data_core.material_manager,
-                &ctx.data_core.object_manager,
-                camera_manager,
-                ctx.renderer.limits.max_compute_workgroups_per_dimension,
-            );
+            let jobs = batch_objects::<M>(&mut ctx, &self.previous_invocation_map_handle, camera, camera_idx);
             if jobs.jobs.is_empty() {
                 return;
             }
-            let draw_calls = self.cull::<M>(&mut ctx, jobs, camera);
+            let draw_calls = self.cull::<M>(&mut ctx, jobs, depth_handle, camera_idx);
-            ctx.graph_data.set_data(output, Some(draw_calls));
+            ctx.graph_data.set_data(output, Some(Arc::new(draw_calls)));
         });
     }
 }
diff --git a/rend3-routine/src/culling/mod.rs b/rend3-routine/src/culling/mod.rs
index ce90aa48..805aa218 100644
--- a/rend3-routine/src/culling/mod.rs
+++ b/rend3-routine/src/culling/mod.rs
@@ -1,8 +1,10 @@
 const BATCH_SIZE: usize = 256;
-const WORKGROUP_SIZE: u32 = 256;
+const WORKGROUP_SIZE: u32 = 64;
 mod batching;
 mod culler;
+mod suballoc;
 pub use batching::{ShaderBatchData, ShaderBatchDatas};
-pub use culler::{DrawCall, DrawCallSet, GpuCuller};
+pub use culler::{CullingBufferMap, DrawCall, DrawCallSet, GpuCuller};
+pub use suballoc::{InputOutputBuffer, InputOutputPartition};
diff --git a/rend3-routine/src/culling/suballoc.rs b/rend3-routine/src/culling/suballoc.rs
new file mode 100644
index 00000000..467b2c2d
--- /dev/null
+++ b/rend3-routine/src/culling/suballoc.rs
@@ -0,0 +1,227 @@
+use std::{
+    ops::{Deref, Range},
+    sync::Arc,
+};
+
+use encase::{internal::WriteInto, ShaderType, StorageBuffer};
+use rend3::util::{math::IntegerExt, typedefs::SsoString};
+use wgpu::CommandEncoder;
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub enum InputOutputPartition {
+    Input,
+    Output,
+}
+
+#[derive(Debug)]
+pub struct InputOutputBuffer {
+    /// Label for the buffer
+    label: SsoString,
+    /// Current active buffer
+    buffer: Arc<wgpu::Buffer>,
+    /// Number of elements reserved in the buffer for data, not including the header.
+    capacity_elements: u64,
+    /// Size of output partition
+    output_partition_elements: u64,
+    /// Size of input partition
+    input_partition_elements: u64,
+    /// When false, the output partition comes first.
+    ///
+    /// When true, the input partition comes first.
+    flipped: bool,
+    /// Clear on swap
+    ///
+    /// When true, the data in both partitions will be cleared when the buffer
+    /// is swapped.
+    clear_on_swap: bool,
+    /// The size of each element in the buffer. This allows the user to provide sizes in element counts only.
+    ///
+    /// Must be a multiple of `element_alignment`.
+    element_size: u64,
+    /// Size of the header, including padding.
+ padded_header_size: u64, +} + +impl Deref for InputOutputBuffer { + type Target = Arc; + + fn deref(&self) -> &Self::Target { + &self.buffer + } +} + +impl InputOutputBuffer { + const USAGES: wgpu::BufferUsages = wgpu::BufferUsages::STORAGE + .union(wgpu::BufferUsages::COPY_DST) + .union(wgpu::BufferUsages::COPY_SRC) + .union(wgpu::BufferUsages::INDEX) + .union(wgpu::BufferUsages::INDIRECT); + + // The size of the header, including padding + fn padded_header_size(element_alignment: u64) -> u64 { + const HEADER_SIZE: u64 = 8; + HEADER_SIZE.round_up(element_alignment) + } + + fn capacity_elements(input_partition_elements: u64, output_partition_elements: u64) -> u64 { + let max = input_partition_elements.max(output_partition_elements); + max.next_power_of_two() * 2 + } + + fn buffer_size(padded_header_size: u64, capacity_elements: u64, element_size: u64) -> u64 { + capacity_elements * element_size + padded_header_size + } + + pub fn new( + device: &wgpu::Device, + queue: &wgpu::Queue, + partition_elements: u64, + label: &str, + element_size: u64, + element_alignment: u64, + clear_on_swap: bool, + ) -> Self { + let element_size = element_size.round_up(element_alignment); + let capacity_elements = Self::capacity_elements(partition_elements, partition_elements); + let padded_header_size = Self::padded_header_size(element_alignment); + let buffer_length = Self::buffer_size(padded_header_size, capacity_elements, element_size); + + let buffer = Arc::new(device.create_buffer(&wgpu::BufferDescriptor { + label: Some(label), + size: buffer_length, + usage: Self::USAGES, + mapped_at_creation: false, + })); + + let this = Self { + label: SsoString::from(label), + buffer, + capacity_elements, + output_partition_elements: partition_elements, + input_partition_elements: partition_elements, + flipped: false, + clear_on_swap, + element_size, + padded_header_size, + }; + + this.write_headers(queue); + + this + } + + /// Returns the offset in bytes for a given element in the given partition + pub fn element_offset(&self, partition: InputOutputPartition, element: u64) -> u64 { + let partition_offset = match partition { + InputOutputPartition::Input => self.input_partition_offset(), + InputOutputPartition::Output => self.output_partition_offset(), + }; + self.padded_header_size + partition_offset + element * self.element_size + } + + pub fn partition_slice(&self, partition: InputOutputPartition) -> Range { + let partition_offset = match partition { + InputOutputPartition::Input => self.input_partition_offset(), + InputOutputPartition::Output => self.output_partition_offset(), + }; + let partition_elements = match partition { + InputOutputPartition::Input => self.input_partition_elements, + InputOutputPartition::Output => self.output_partition_elements, + }; + let partition_size = partition_elements * self.element_size; + let slice_start = self.padded_header_size + partition_offset; + let slice_end: u64 = slice_start + partition_size; + slice_start..slice_end + } + + pub fn write_to_output(&self, queue: &wgpu::Queue, data: &T) { + assert_eq!(data.size().get(), self.output_partition_elements * self.element_size); + let mut mapping = queue + .write_buffer_with( + &self.buffer, + self.element_offset(InputOutputPartition::Output, 0), + data.size(), + ) + .unwrap(); + StorageBuffer::new(&mut *mapping).write(data).unwrap(); + drop(mapping); + } + + /// Returns the offset in bytes to get to the start of the output partition, not including the header. 
+    fn output_partition_offset(&self) -> u64 {
+        if self.flipped {
+            (self.capacity_elements * self.element_size) / 2
+        } else {
+            0
+        }
+    }
+
+    /// Returns the offset in bytes to get to the start of the input partition, not including the header.
+    fn input_partition_offset(&self) -> u64 {
+        if self.flipped {
+            0
+        } else {
+            (self.capacity_elements * self.element_size) / 2
+        }
+    }
+
+    pub fn swap(
+        &mut self,
+        queue: &wgpu::Queue,
+        device: &wgpu::Device,
+        encoder: &mut CommandEncoder,
+        new_partition_elements: u64,
+    ) {
+        // Offset of the output partition in the old buffer.
+        let old_output_partition_offset = self.output_partition_offset();
+
+        // The output of last frame is now the input of this frame.
+        self.input_partition_elements = self.output_partition_elements;
+        // The new output is of the given size.
+        self.output_partition_elements = new_partition_elements;
+        // We're now flipped.
+        self.flipped = !self.flipped;
+
+        // Compute the new data capacity
+        let new_capacity_elements =
+            Self::capacity_elements(self.input_partition_elements, self.output_partition_elements);
+
+        if new_capacity_elements != self.capacity_elements {
+            // Set the capacity reserved
+            self.capacity_elements = new_capacity_elements;
+            let new_buffer = device.create_buffer(&wgpu::BufferDescriptor {
+                label: Some(&self.label),
+                size: Self::buffer_size(self.padded_header_size, new_capacity_elements, self.element_size),
+                usage: Self::USAGES,
+                mapped_at_creation: false,
+            });
+            if !self.clear_on_swap {
+                // We copy the old output partition to the input partition of the new buffer.
+                //
+                // Note that we call output_partition_offset before we change any internal parameters,
+                // as we need the old buffer offsets.
+                encoder.copy_buffer_to_buffer(
+                    &self.buffer,
+                    old_output_partition_offset + self.padded_header_size,
+                    &new_buffer,
+                    self.input_partition_offset() + self.padded_header_size,
+                    self.input_partition_elements * self.element_size,
+                );
+            }
+            // We now set the new buffer.
+            self.buffer = Arc::new(new_buffer);
+        } else if self.clear_on_swap {
+            encoder.clear_buffer(&self.buffer, self.padded_header_size, None);
+        }
+
+        self.write_headers(queue)
+    }
+
+    fn write_headers(&self, queue: &wgpu::Queue) {
+        let offsets = [
+            (self.output_partition_offset() / self.element_size) as u32,
+            (self.input_partition_offset() / self.element_size) as u32,
+        ];
+        queue.write_buffer(&self.buffer, 0, bytemuck::cast_slice(&offsets));
+    }
+}
diff --git a/rend3-routine/src/forward.rs b/rend3-routine/src/forward.rs
index 92711ac9..c33fa69c 100644
--- a/rend3-routine/src/forward.rs
+++ b/rend3-routine/src/forward.rs
@@ -2,7 +2,7 @@
 //!
 //! Will default to the PBR shader code if custom code is not specified.
-use std::marker::PhantomData; +use std::{marker::PhantomData, sync::Arc}; use arrayvec::ArrayVec; use encase::ShaderSize; @@ -11,21 +11,21 @@ use rend3::{ DataHandle, NodeResourceUsage, RenderGraph, RenderPassDepthTarget, RenderPassTarget, RenderPassTargets, RenderTargetHandle, }, - types::{Handedness, Material, SampleCount}, - util::bind_merge::BindGroupBuilder, + types::{GraphDataHandle, Material, SampleCount}, + util::{bind_merge::BindGroupBuilder, typedefs::FastHashMap}, ProfileData, Renderer, RendererDataCore, RendererProfile, ShaderPreProcessor, }; use serde::Serialize; use wgpu::{ BindGroup, BindGroupLayout, Color, ColorTargetState, ColorWrites, CompareFunction, DepthBiasState, - DepthStencilState, Face, FragmentState, FrontFace, IndexFormat, MultisampleState, PipelineLayoutDescriptor, - PolygonMode, PrimitiveState, PrimitiveTopology, RenderPipeline, RenderPipelineDescriptor, ShaderModule, - StencilState, TextureFormat, VertexState, + DepthStencilState, FragmentState, IndexFormat, MultisampleState, PipelineLayoutDescriptor, PolygonMode, + PrimitiveState, PrimitiveTopology, RenderPipeline, RenderPipelineDescriptor, ShaderModule, StencilState, + TextureFormat, VertexState, }; use crate::{ - common::{PerMaterialArchetypeInterface, WholeFrameInterfaces}, - culling::{self, DrawCall}, + common::{CameraIndex, PerMaterialArchetypeInterface, WholeFrameInterfaces}, + culling::{self, CullingBufferMap, DrawCall, DrawCallSet, InputOutputPartition}, }; #[derive(Serialize)] @@ -50,7 +50,7 @@ pub struct ShaderModulePair<'a> { pub struct RoutineArgs<'a, M> { pub name: &'a str, - pub renderer: &'a Renderer, + pub renderer: &'a Arc, pub data_core: &'a mut RendererDataCore, pub spp: &'a ShaderPreProcessor, @@ -61,6 +61,8 @@ pub struct RoutineArgs<'a, M> { pub routine_type: RoutineType, pub shaders: ShaderModulePair<'a>, + pub culling_buffer_map_handle: GraphDataHandle, + pub extra_bgls: &'a [&'a BindGroupLayout], #[allow(clippy::type_complexity)] pub descriptor_callback: Option<&'a dyn Fn(&mut RenderPipelineDescriptor<'_>, &mut [Option])>, @@ -69,7 +71,10 @@ pub struct RoutineArgs<'a, M> { pub struct RoutineAddToGraphArgs<'a, 'node, M> { pub graph: &'a mut RenderGraph<'node>, pub whole_frame_uniform_bg: DataHandle, - pub culled: DataHandle, + // If this is None, we are rendering the first pass with the predicted triangles from last frame. + // + // If this is Some, we are rendering the second pass with the residual triangles from this frame. + pub culling_output_handle: Option>>, pub per_material: &'node PerMaterialArchetypeInterface, pub extra_bgs: Option<&'node [BindGroup]>, pub label: &'a str, @@ -77,8 +82,7 @@ pub struct RoutineAddToGraphArgs<'a, 'node, M> { pub color: Option, pub resolve: Option, pub depth: RenderTargetHandle, - /// Passed to the shader through the instance index. - pub data: u32, + pub camera: CameraIndex, } /// A set of pipelines for rendering a specific combination of a material. 
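For orientation: `culling_output_handle` doubles as the pass selector. `None` makes the routine replay the draw call set it cached for that camera on the previous frame (the predicted pass), while `Some(..)` consumes the culling output produced this frame (the residual pass). Below is a minimal sketch of scheduling the predicted pass for the viewport camera, mirroring the residual helper shown earlier in base.rs; the helper name, the `state` field access, and the import paths are illustrative assumptions, not part of this patch.

use rend3::{graph::RenderGraph, types::SampleCount};
use rend3_routine::{
    base::BaseRenderGraphIntermediateState, common::CameraIndex, forward::RoutineAddToGraphArgs, pbr::PbrRoutine,
};

// Hypothetical helper: schedule the predicted opaque pass. With no culling output
// handle, the routine falls back to the draw call set cached for this camera last frame.
fn add_predicted_opaque_pass<'node>(
    state: &BaseRenderGraphIntermediateState,
    graph: &mut RenderGraph<'node>,
    pbr: &'node PbrRoutine,
    samples: SampleCount,
) {
    pbr.opaque_routine.add_forward_to_graph(RoutineAddToGraphArgs {
        graph,
        whole_frame_uniform_bg: state.forward_uniform_bg,
        culling_output_handle: None, // predicted pass: no same-frame culling results yet
        per_material: &pbr.per_material,
        extra_bgs: None,
        label: "PBR Forward Pass 1",
        samples,
        camera: CameraIndex::Viewport,
        color: Some(state.color),
        resolve: state.resolve,
        depth: state.depth.rendering_target(),
    });
}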
@@ -86,6 +90,8 @@ pub struct ForwardRoutine<M: Material> {
     pub pipeline_s1: RenderPipeline,
     pub pipeline_s4: RenderPipeline,
     pub material_key: u64,
+    pub culling_buffer_map_handle: GraphDataHandle<CullingBufferMap>,
+    pub draw_call_set_cache_handle: GraphDataHandle<FastHashMap<CameraIndex, Arc<DrawCallSet>>>,
     pub _phantom: PhantomData<M>,
 }
 impl<M: Material> ForwardRoutine<M> {
@@ -130,12 +136,13 @@ impl<M: Material> ForwardRoutine<M> {
             pipeline_s1: build_forward_pipeline_inner(&pll, &args, SampleCount::One),
             pipeline_s4: build_forward_pipeline_inner(&pll, &args, SampleCount::Four),
             material_key: args.material_key,
+            draw_call_set_cache_handle: args.renderer.add_graph_data(FastHashMap::default()),
+            culling_buffer_map_handle: args.culling_buffer_map_handle,
             _phantom: PhantomData,
         }
     }
     /// Add the given routine to the graph with the given settings.
-    #[allow(clippy::too_many_arguments)]
     pub fn add_forward_to_graph<'node>(&'node self, args: RoutineAddToGraphArgs<'_, 'node, M>) {
         let mut builder = args.graph.add_node(args.label);
@@ -162,24 +169,73 @@ impl<M: Material> ForwardRoutine<M> {
         });
         let whole_frame_uniform_handle = builder.add_data(args.whole_frame_uniform_bg, NodeResourceUsage::Input);
-        let cull_handle = builder.add_data(args.culled, NodeResourceUsage::Input);
+        let culling_output_handle = builder.add_optional_data(args.culling_output_handle, NodeResourceUsage::Input);
         builder.build(move |mut ctx| {
             let rpass = ctx.encoder_or_pass.take_rpass(rpass_handle);
             let whole_frame_uniform_bg = ctx.graph_data.get_data(ctx.temps, whole_frame_uniform_handle).unwrap();
-            let culled = match ctx.graph_data.get_data(ctx.temps, cull_handle) {
-                Some(c) => c,
-                None => return,
+
+            // We need to store the draw call set in a cache so that next frame's predicted pass can use it.
+            let mut draw_call_set_cache = ctx.data_core.graph_storage.get_mut(&self.draw_call_set_cache_handle);
+
+            let draw_call_set = match culling_output_handle {
+                // If we are provided a culling output handle, we are rendering the second pass
+                // with the residual triangles from this frame.
+                Some(handle) => {
+                    // If the culling node didn't produce a draw call set, there isn't actually anything to render.
+                    let Some(draw_call_set) = ctx.graph_data.get_data(ctx.temps, handle) else {
+                        return;
+                    };
+
+                    // As we're in the residual pass, we need to store the draw call set for the next frame.
+                    draw_call_set_cache.insert(args.camera, Arc::clone(draw_call_set));
+
+                    draw_call_set
+                }
+                // If we are not provided a culling output handle, this means we are rendering the first pass
+                // with the predicted triangles from last frame.
+                None => {
+                    // If there is no draw call set for this camera in the cache, that means we have yet to actually render anything,
+                    // so either no objects yet exist, or we are in the first frame.
+                    let Some(draw_call_set) = draw_call_set_cache.get(&args.camera) else {
+                        return;
+                    };
+
+                    draw_call_set
+                }
+            };
+            let residual = culling_output_handle.is_some() && args.camera.is_viewport();
+
+            let culling_buffer_storage = ctx.data_core.graph_storage.get(&self.culling_buffer_map_handle);
+
+            // If there are no culling buffers in storage yet, we are in the first frame. We depend on culling
+            // to render anything, so just bail at this point.
+            let Some(culling_buffers) = culling_buffer_storage.get_buffers(args.camera) else {
+                return;
+            };
+
+            // We need to actually clone ownership of the underlying buffers and add them to renderpass temps,
+            // so we can use them in the renderpass.
+ let index_buffer = ctx.temps.add(Arc::clone(&culling_buffers.index_buffer)); + let draw_call_buffer = ctx.temps.add(Arc::clone(&culling_buffers.draw_call_buffer)); + + // When we're rendering the residual data, we are post buffer flip. We want to be rendering using the + // "input" partition, as this is the partition that all same-frame data is in. + let partition = if residual { + InputOutputPartition::Input + } else { + InputOutputPartition::Output }; let per_material_bg = ctx.temps.add( BindGroupBuilder::new() .append_buffer(ctx.data_core.object_manager.buffer::().unwrap()) .append_buffer_with_size( - &culled.buffers.object_reference, + &draw_call_set.culling_data_buffer, culling::ShaderBatchData::SHADER_SIZE.get(), ) .append_buffer(&ctx.eval_output.mesh_buffer) + .append_buffer(&draw_call_set.per_camera_uniform) .append_buffer(ctx.data_core.material_manager.archetype_view::().buffer()) .build(&ctx.renderer.device, Some("Per-Material BG"), &args.per_material.bgl), ); @@ -188,8 +244,10 @@ impl ForwardRoutine { SampleCount::One => &self.pipeline_s1, SampleCount::Four => &self.pipeline_s4, }; - - rpass.set_index_buffer(culled.buffers.index.slice(..), IndexFormat::Uint32); + rpass.set_index_buffer( + index_buffer.slice(culling_buffers.index_buffer.partition_slice(partition)), + IndexFormat::Uint32, + ); rpass.set_pipeline(pipeline); rpass.set_bind_group(0, whole_frame_uniform_bg, &[]); if let Some(v) = args.extra_bgs { @@ -201,12 +259,18 @@ impl ForwardRoutine { rpass.set_bind_group(2, bg, &[]); } - let Some(range) = culled.material_key_ranges.get(&self.material_key) else { + // If there are no draw calls for this material, just bail. + let Some(range) = draw_call_set.material_key_ranges.get(&self.material_key) else { return; }; - for call in &culled.draw_calls[range.clone()] { + + for (range_relative_idx, call) in draw_call_set.draw_calls[range.clone()].iter().enumerate() { + // Help RA out let call: &DrawCall = call; + // Add the base of the range to the index to get the actual index + let idx = range_relative_idx + range.start; + // If we're in cpu driven mode, we need to update the texture bind group. 
if ctx.renderer.profile.is_cpu_driven() { rpass.set_bind_group( 2, @@ -219,7 +283,10 @@ impl ForwardRoutine { per_material_bg, &[call.batch_index * culling::ShaderBatchData::SHADER_SIZE.get() as u32], ); - rpass.draw_indexed(call.index_range.clone(), 0, args.data..args.data + 1); + rpass.draw_indexed_indirect( + draw_call_buffer, + culling_buffers.draw_call_buffer.element_offset(partition, idx as u64), + ); } }); } @@ -249,11 +316,11 @@ fn build_forward_pipeline_inner( primitive: PrimitiveState { topology: PrimitiveTopology::TriangleList, strip_index_format: None, - front_face: match args.renderer.handedness { - Handedness::Left => FrontFace::Cw, - Handedness::Right => FrontFace::Ccw, - }, - cull_mode: Some(Face::Back), + front_face: args.renderer.handedness.into(), + cull_mode: Some(match args.routine_type { + RoutineType::Depth => wgpu::Face::Front, + RoutineType::Forward => wgpu::Face::Back, + }), unclipped_depth: false, polygon_mode: PolygonMode::Fill, conservative: false, @@ -264,9 +331,10 @@ fn build_forward_pipeline_inner( depth_compare: CompareFunction::GreaterEqual, stencil: StencilState::default(), bias: match args.routine_type { + // TODO: figure out what to put here RoutineType::Depth => DepthBiasState { - constant: -2, - slope_scale: -2.0, + constant: 0, + slope_scale: 0.0, clamp: 0.0, }, RoutineType::Forward => DepthBiasState::default(), diff --git a/rend3-routine/src/hi_z.rs b/rend3-routine/src/hi_z.rs new file mode 100644 index 00000000..39f6d0b2 --- /dev/null +++ b/rend3-routine/src/hi_z.rs @@ -0,0 +1,276 @@ +use std::borrow::Cow; + +use glam::UVec2; +use rend3::{ + graph::{ + DeclaredDependency, NodeExecutionContext, NodeResourceUsage, RenderGraph, RenderPassDepthTarget, + RenderPassHandle, RenderPassTargets, RenderTargetHandle, ViewportRect, + }, + Renderer, ShaderPreProcessor, +}; +use wgpu::{ + BindGroupDescriptor, BindGroupEntry, BindGroupLayout, BindGroupLayoutDescriptor, BindGroupLayoutEntry, + BindingResource, BindingType, CompareFunction, DepthBiasState, DepthStencilState, Extent3d, FragmentState, + MultisampleState, PipelineLayoutDescriptor, PrimitiveState, RenderPipeline, RenderPipelineDescriptor, + ShaderModuleDescriptor, ShaderStages, StencilState, TextureDimension, TextureFormat, TextureSampleType, + TextureViewDimension, VertexState, +}; + +use crate::base::DepthTargets; + +pub struct HiZRoutine { + multisampled_bgl: BindGroupLayout, + single_sampled_bgl: BindGroupLayout, + downscale_pipeline: RenderPipeline, + resolve_pipeline: RenderPipeline, +} + +impl HiZRoutine { + pub fn new(renderer: &Renderer, spp: &ShaderPreProcessor) -> Self { + let resolve_source = spp + .render_shader( + "rend3-routine/resolve_depth_min.wgsl", + &serde_json::json!({"SAMPLES": 4}), + None, + ) + .unwrap(); + let downscale_source = spp.render_shader("rend3-routine/hi_z.wgsl", &(), None).unwrap(); + + let resolve_sm = renderer.device.create_shader_module(ShaderModuleDescriptor { + label: Some("HiZ Resolver"), + source: wgpu::ShaderSource::Wgsl(Cow::Owned(resolve_source)), + }); + let downscale_sm = renderer.device.create_shader_module(ShaderModuleDescriptor { + label: Some("HiZ Downscaler"), + source: wgpu::ShaderSource::Wgsl(Cow::Owned(downscale_source)), + }); + + let multisampled_bgl = renderer.device.create_bind_group_layout(&BindGroupLayoutDescriptor { + label: Some("Multi Sample HiZ Texture BGL"), + entries: &[BindGroupLayoutEntry { + binding: 0, + visibility: ShaderStages::VERTEX_FRAGMENT, + ty: BindingType::Texture { + sample_type: TextureSampleType::Depth, + view_dimension: 
TextureViewDimension::D2, + multisampled: true, + }, + count: None, + }], + }); + + let single_sampled_bgl = renderer.device.create_bind_group_layout(&BindGroupLayoutDescriptor { + label: Some("Single Sample HiZ Texture BGL"), + entries: &[BindGroupLayoutEntry { + binding: 0, + visibility: ShaderStages::VERTEX_FRAGMENT, + ty: BindingType::Texture { + sample_type: TextureSampleType::Depth, + view_dimension: TextureViewDimension::D2, + multisampled: false, + }, + count: None, + }], + }); + + let resolve_pipline_layout = renderer.device.create_pipeline_layout(&PipelineLayoutDescriptor { + label: Some("HiZ Resolve PLL"), + bind_group_layouts: &[&multisampled_bgl], + push_constant_ranges: &[], + }); + + let downscale_pipline_layout = renderer.device.create_pipeline_layout(&PipelineLayoutDescriptor { + label: Some("HiZ Downscale PLL"), + bind_group_layouts: &[&single_sampled_bgl], + push_constant_ranges: &[], + }); + + let resolve_pipeline = renderer.device.create_render_pipeline(&RenderPipelineDescriptor { + label: Some("HiZ Resolve Pipeline"), + layout: Some(&resolve_pipline_layout), + vertex: VertexState { + module: &resolve_sm, + entry_point: "vs_main", + buffers: &[], + }, + primitive: PrimitiveState::default(), + depth_stencil: Some(DepthStencilState { + format: TextureFormat::Depth32Float, + depth_write_enabled: true, + depth_compare: CompareFunction::Always, + stencil: StencilState::default(), + bias: DepthBiasState::default(), + }), + multisample: MultisampleState::default(), + fragment: Some(FragmentState { + module: &resolve_sm, + entry_point: "fs_main", + targets: &[], + }), + multiview: None, + }); + + let downscale_pipeline = renderer.device.create_render_pipeline(&RenderPipelineDescriptor { + label: Some("HiZ Downscale Pipeline"), + layout: Some(&downscale_pipline_layout), + vertex: VertexState { + module: &downscale_sm, + entry_point: "vs_main", + buffers: &[], + }, + primitive: PrimitiveState::default(), + depth_stencil: Some(DepthStencilState { + format: TextureFormat::Depth32Float, + depth_write_enabled: true, + depth_compare: CompareFunction::Always, + stencil: StencilState::default(), + bias: DepthBiasState::default(), + }), + multisample: MultisampleState::default(), + fragment: Some(FragmentState { + module: &downscale_sm, + entry_point: "fs_main", + targets: &[], + }), + multiview: None, + }); + + Self { + single_sampled_bgl, + downscale_pipeline, + multisampled_bgl, + resolve_pipeline, + } + } + + pub fn resolve<'pass>( + &'pass self, + mut ctx: NodeExecutionContext<'_, 'pass, '_>, + renderpass_handle: DeclaredDependency, + source_handle: DeclaredDependency, + ) { + let rpass = ctx.encoder_or_pass.take_rpass(renderpass_handle); + let source = ctx.graph_data.get_render_target(source_handle); + + let bind_group = ctx + .temps + .add(ctx.renderer.device.create_bind_group(&BindGroupDescriptor { + label: Some("HiZ Resolve BG"), + layout: &self.multisampled_bgl, + entries: &[BindGroupEntry { + binding: 0, + resource: BindingResource::TextureView(source), + }], + })); + + rpass.set_pipeline(&self.resolve_pipeline); + rpass.set_bind_group(0, bind_group, &[]); + rpass.draw(0..3, 0..1); + } + + pub fn downscale<'pass>( + &'pass self, + mut ctx: NodeExecutionContext<'_, 'pass, '_>, + renderpass_handle: DeclaredDependency, + source_handle: DeclaredDependency, + ) { + let rpass = ctx.encoder_or_pass.take_rpass(renderpass_handle); + let source = ctx.graph_data.get_render_target(source_handle); + + let bind_group = ctx + .temps + 
.add(ctx.renderer.device.create_bind_group(&BindGroupDescriptor { + label: Some("HiZ Bind Group Layout"), + layout: &self.single_sampled_bgl, + entries: &[BindGroupEntry { + binding: 0, + resource: BindingResource::TextureView(source), + }], + })); + + rpass.set_pipeline(&self.downscale_pipeline); + rpass.set_bind_group(0, bind_group, &[]); + rpass.draw(0..3, 0..1); + } + + pub fn add_hi_z_to_graph<'node>( + &'node self, + graph: &mut RenderGraph<'node>, + depth_targets: DepthTargets, + resolution: UVec2, + ) { + let extent = Extent3d { + width: resolution.x, + height: resolution.y, + depth_or_array_layers: 1, + }; + let mips = extent.max_mips(TextureDimension::D2) as u8; + + // First we need to downscale the depth buffer to a single sample texture + // if we are doing multisampling. + if let Some(multi_sample) = depth_targets.multi_sample { + let mut node = graph.add_node("HiZ Resolve"); + + let target = node.add_render_target( + depth_targets.single_sample_mipped.set_mips(0..1), + NodeResourceUsage::Output, + ); + + let source = node.add_render_target(multi_sample, NodeResourceUsage::Output); + + let rpass_handle = node.add_renderpass(RenderPassTargets { + targets: vec![], + depth_stencil: Some(RenderPassDepthTarget { + target, + depth_clear: Some(0.0), + stencil_clear: None, + }), + }); + + node.add_side_effect(); + + node.build(move |ctx| { + self.resolve(ctx, rpass_handle, source); + }); + } + + for dst_mip in 1..mips { + let src_mip = dst_mip - 1; + + let mut node = graph.add_node(&format!("HiZ Mip {src_mip} -> {dst_mip}")); + + let dst_extent = extent.mip_level_size(dst_mip as u32, TextureDimension::D2); + let src_extent = extent.mip_level_size(src_mip as u32, TextureDimension::D2); + + let dst_target = node.add_render_target( + depth_targets + .single_sample_mipped + .set_mips(dst_mip..dst_mip + 1) + .set_viewport(ViewportRect::from_size(UVec2::new(dst_extent.width, dst_extent.height))), + NodeResourceUsage::Output, + ); + let src_target = node.add_render_target( + depth_targets + .single_sample_mipped + .set_mips(src_mip..src_mip + 1) + .set_viewport(ViewportRect::from_size(UVec2::new(src_extent.width, src_extent.height))), + NodeResourceUsage::Input, + ); + + let rpass_handle = node.add_renderpass(RenderPassTargets { + targets: vec![], + depth_stencil: Some(RenderPassDepthTarget { + target: dst_target, + depth_clear: Some(0.0), + stencil_clear: None, + }), + }); + + node.add_side_effect(); + + node.build(move |ctx| { + self.downscale(ctx, rpass_handle, src_target); + }); + } + } +} diff --git a/rend3-routine/src/lib.rs b/rend3-routine/src/lib.rs index b09cbc4d..9f13361b 100644 --- a/rend3-routine/src/lib.rs +++ b/rend3-routine/src/lib.rs @@ -23,6 +23,7 @@ pub mod clear; pub mod common; pub mod culling; pub mod forward; +pub mod hi_z; pub mod pbr; mod shaders; pub mod skinning; diff --git a/rend3-routine/src/pbr/routine.rs b/rend3-routine/src/pbr/routine.rs index 36878e37..f54e7914 100644 --- a/rend3-routine/src/pbr/routine.rs +++ b/rend3-routine/src/pbr/routine.rs @@ -1,12 +1,16 @@ -use std::borrow::Cow; +use std::{borrow::Cow, sync::Arc}; -use rend3::{Renderer, RendererDataCore, RendererProfile, ShaderPreProcessor, ShaderVertexBufferConfig}; +use rend3::{ + types::GraphDataHandle, Renderer, RendererDataCore, RendererProfile, ShaderPreProcessor, ShaderVertexBufferConfig, +}; use serde::Serialize; use wgpu::{BlendState, ShaderModuleDescriptor, ShaderSource}; use crate::{ common::{PerMaterialArchetypeInterface, WholeFrameInterfaces}, + culling::CullingBufferMap, 
forward::{ForwardRoutine, RoutineArgs, RoutineType, ShaderModulePair}, + hi_z::HiZRoutine, pbr::{PbrMaterial, TransparencyType}, }; @@ -23,15 +27,17 @@ pub struct PbrRoutine { pub opaque_routine: ForwardRoutine, pub cutout_routine: ForwardRoutine, pub blend_routine: ForwardRoutine, + pub hi_z: HiZRoutine, pub per_material: PerMaterialArchetypeInterface, } impl PbrRoutine { pub fn new( - renderer: &Renderer, + renderer: &Arc, data_core: &mut RendererDataCore, spp: &ShaderPreProcessor, interfaces: &WholeFrameInterfaces, + culling_buffer_map_handle: &GraphDataHandle, ) -> Self { profiling::scope!("PbrRenderRoutine::new"); @@ -125,6 +131,7 @@ impl PbrRoutine { targets[0].as_mut().unwrap().blend = Some(BlendState::ALPHA_BLENDING) } }), + culling_buffer_map_handle: culling_buffer_map_handle.clone(), }) }; @@ -134,6 +141,7 @@ impl PbrRoutine { opaque_routine: inner(RoutineType::Forward, &pbr_forward, TransparencyType::Opaque), cutout_routine: inner(RoutineType::Forward, &pbr_cutout, TransparencyType::Cutout), blend_routine: inner(RoutineType::Forward, &pbr_forward, TransparencyType::Blend), + hi_z: HiZRoutine::new(renderer, spp), per_material, } } diff --git a/rend3-routine/src/shaders.rs b/rend3-routine/src/shaders.rs index e59ac7db..89f1ac85 100644 --- a/rend3-routine/src/shaders.rs +++ b/rend3-routine/src/shaders.rs @@ -23,7 +23,8 @@ mod tests { }, }; use naga::WithSpan; - use rend3::{RendererProfile, ShaderConfig, ShaderPreProcessor, ShaderVertexBufferConfig}; + use rend3::{RendererProfile, ShaderPreProcessor, ShaderVertexBufferConfig}; + use serde_json::json; use crate::{pbr::PbrMaterial, shaders::Rend3RoutineShaderSources}; @@ -70,16 +71,24 @@ mod tests { let source = pp.get(shader).unwrap(); let configs = if source.contains("#if") { - &[ - ShaderConfig { - profile: Some(RendererProfile::CpuDriven), - }, - ShaderConfig { - profile: Some(RendererProfile::GpuDriven), - }, - ][..] + vec![ + json!({ + "profile": Some(RendererProfile::GpuDriven), + "position_attribute_offset": 0, + "SAMPLES": 1, + }), + json!({ + "profile": Some(RendererProfile::CpuDriven), + "position_attribute_offset": 0, + "SAMPLES": 1, + }), + ] } else { - &[ShaderConfig { profile: None }][..] 
+ vec![json!({ + "profile": Some(RendererProfile::CpuDriven), + "position_attribute_offset": 0, + "SAMPLES": 1, + })] }; if source.contains("DO NOT VALIDATE") { @@ -87,12 +96,11 @@ mod tests { } for config in configs { - let serialized_config = serde_json::to_value(config).unwrap(); - println!("Testing shader {shader} with config {serialized_config:?}"); + println!("Testing shader {shader} with config {config:?}"); let output = pp.render_shader( shader, - config, + &config, Some(&ShaderVertexBufferConfig::from_material::()), ); diff --git a/rend3-routine/src/skinning.rs b/rend3-routine/src/skinning.rs index 3990338e..e15d2513 100644 --- a/rend3-routine/src/skinning.rs +++ b/rend3-routine/src/skinning.rs @@ -10,7 +10,7 @@ use rend3::{ }, util::{ bind_merge::{BindGroupBuilder, BindGroupLayoutBuilder}, - math::round_up_div, + math::div_round_up, }, ShaderPreProcessor, }; @@ -199,7 +199,7 @@ impl GpuSkinner { let offset = (i as u64 * GpuSkinningInput::SHADER_SIZE.get()) as u32; cpass.set_bind_group(0, &bg, &[offset]); - let num_workgroups = round_up_div(skel.vertex_count, Self::WORKGROUP_SIZE); + let num_workgroups = div_round_up(skel.vertex_count, Self::WORKGROUP_SIZE); cpass.dispatch_workgroups(num_workgroups, 1, 1); } } diff --git a/rend3-test/Cargo.toml b/rend3-test/Cargo.toml index 6dddcd74..8ce5668b 100644 --- a/rend3-test/Cargo.toml +++ b/rend3-test/Cargo.toml @@ -15,6 +15,7 @@ name = "rend3-tests" [dependencies] anyhow = "1" +env_logger = "0.10" flume = { version = "0.11", features = ["spin"] } glam = "0.24" image = { version = "0.24", default-features = false, features = ["png"] } diff --git a/rend3-test/src/helpers.rs b/rend3-test/src/helpers.rs index c56a6627..59dad809 100644 --- a/rend3-test/src/helpers.rs +++ b/rend3-test/src/helpers.rs @@ -10,6 +10,12 @@ use crate::TestRunner; pub struct CaptureDropGuard { device: Arc, } +impl CaptureDropGuard { + pub fn start_capture(device: Arc) -> Self { + device.start_capture(); + Self { device } + } +} impl Drop for CaptureDropGuard { fn drop(&mut self) { self.device.stop_capture(); @@ -19,13 +25,6 @@ impl Drop for CaptureDropGuard { } impl TestRunner { - pub fn start_capture(&self) -> CaptureDropGuard { - self.device.start_capture(); - CaptureDropGuard { - device: self.device.clone(), - } - } - pub fn add_directional_light(&self, direction: Vec3) -> DirectionalLightHandle { self.renderer.add_directional_light(rend3::types::DirectionalLight { color: glam::Vec3::ONE, diff --git a/rend3-test/src/lib.rs b/rend3-test/src/lib.rs index 3c286d78..1438b668 100644 --- a/rend3-test/src/lib.rs +++ b/rend3-test/src/lib.rs @@ -1,7 +1,9 @@ mod helpers; mod runner; +mod threshold; pub use runner::{FrameRenderSettings, TestRunner}; +pub use threshold::{Threshold, ThresholdSet}; #[macro_export] macro_rules! 
no_gpu_return { diff --git a/rend3-test/src/runner.rs b/rend3-test/src/runner.rs index e5f21856..b5ddd5f3 100644 --- a/rend3-test/src/runner.rs +++ b/rend3-test/src/runner.rs @@ -14,6 +14,8 @@ use wgpu::{ Extent3d, ImageCopyBuffer, ImageDataLayout, TextureDescriptor, TextureDimension, TextureFormat, TextureUsages, }; +use crate::{helpers::CaptureDropGuard, ThresholdSet}; + #[derive(Clone)] pub struct FrameRenderSettings { size: u32, @@ -68,6 +70,8 @@ impl TestRunnerBuilder { } pub async fn build(self) -> Result { + let _ = env_logger::try_init(); + let iad = match self.iad { Some(iad) => iad, None => rend3::create_iad(None, None, None, None) @@ -75,6 +79,8 @@ impl TestRunnerBuilder { .context("InstanceAdapterDevice creation failed")?, }; + let capture_guard = CaptureDropGuard::start_capture(Arc::clone(&iad.device)); + let renderer = rend3::Renderer::new(iad, self.handness.unwrap_or(Handedness::Left), None) .context("Renderer initialization failed")?; let mut spp = rend3::ShaderPreProcessor::new(); @@ -87,6 +93,7 @@ impl TestRunnerBuilder { &mut renderer.data_core.lock(), &spp, &base_rendergraph.interfaces, + &base_rendergraph.gpu_culler.culling_buffer_map_handle, ); let tonemapping = TonemappingRoutine::new( &renderer, @@ -100,6 +107,7 @@ impl TestRunnerBuilder { pbr, tonemapping, base_rendergraph, + capture_guard, }) } } @@ -109,6 +117,7 @@ pub struct TestRunner { pub pbr: PbrRoutine, pub tonemapping: TonemappingRoutine, pub base_rendergraph: BaseRenderGraph, + pub capture_guard: CaptureDropGuard, } impl Deref for TestRunner { @@ -148,6 +157,7 @@ impl TestRunner { let frame_handle = graph.add_imported_render_target( &texture, 0..1, + 0..1, rend3::graph::ViewportRect::from_size(UVec2::splat(settings.size)), ); @@ -223,7 +233,12 @@ impl TestRunner { .context("Failed to create image from mapping") } - pub fn compare_image_to_path(&self, test_rgba: &image::RgbaImage, path: &Path, threshold: f32) -> Result<()> { + pub fn compare_image_to_path( + &self, + test_rgba: &image::RgbaImage, + path: &Path, + threshold: impl Into, + ) -> Result<()> { #[cfg(not(target_arch = "wasm32"))] { let parent_path = path.parent().context("Path given had no parent")?; @@ -249,11 +264,11 @@ impl TestRunner { let mut pool = nv_flip::FlipPool::from_image(&result_float); - let mean: f32 = pool.mean(); - - let pass = mean <= threshold; + println!("Image Comparison Results"); + let threshold_set: ThresholdSet = threshold.into(); + let pass = threshold_set.check(&mut pool); - println!("Image Comparison Results: {}", if pass { "passed" } else { "failed" }); + println!(); println!(" Mean: {}", pool.mean()); println!(" Min: {}", pool.min_value()); println!(" 25%: {}", pool.get_percentile(0.25, true)); @@ -262,6 +277,8 @@ impl TestRunner { println!(" 95%: {}", pool.get_percentile(0.95, true)); println!(" 99%: {}", pool.get_percentile(0.99, true)); println!(" Max: {}", pool.max_value()); + println!("{}", if pass { "Passed!" } else { "Failed!" 
}); + println!(); let filename = path.file_stem().unwrap(); @@ -286,7 +303,7 @@ impl TestRunner { &self, settings: FrameRenderSettings, path: impl AsRef, - threshold: f32, + threshold: impl Into, ) -> Result<()> { let test_rgba = self.render_frame(settings).await?; diff --git a/rend3-test/src/threshold.rs b/rend3-test/src/threshold.rs new file mode 100644 index 00000000..e987faaf --- /dev/null +++ b/rend3-test/src/threshold.rs @@ -0,0 +1,73 @@ +pub struct ThresholdSet { + #[cfg_attr(target_arch = "wasm32", allow(unused))] + thresholds: Vec, +} + +impl ThresholdSet { + #[cfg(not(target_arch = "wasm32"))] + pub fn check(self, pool: &mut nv_flip::FlipPool) -> bool { + // If there are no checks, we want to fail the test. + let mut all_passed = !self.thresholds.is_empty(); + // We always iterate all of these, as the call to check prints + for check in self.thresholds { + all_passed &= check.check(pool); + } + all_passed + } +} + +#[derive(Debug, Copy, Clone, PartialEq)] +pub enum Threshold { + Mean(f32), + Percentile { percentile: f32, threshold: f32 }, +} + +impl Threshold { + #[cfg(not(target_arch = "wasm32"))] + fn check(&self, pool: &mut nv_flip::FlipPool) -> bool { + match *self { + Self::Mean(v) => { + let mean = pool.mean(); + let within = mean <= v; + println!( + " Expected Mean ({:.6}) to be under expected maximum ({}): {}", + mean, + v, + if within { "PASS" } else { "FAIL" } + ); + within + } + Self::Percentile { + percentile: p, + threshold: v, + } => { + let percentile = pool.get_percentile(p, true); + let within = percentile <= v; + println!( + " Expected {}% ({:.6}) to be under expected maximum ({}): {}", + p * 100.0, + percentile, + v, + if within { "PASS" } else { "FAIL" } + ); + within + } + } + } +} + +impl From for ThresholdSet { + fn from(threshold: Threshold) -> Self { + Self { + thresholds: vec![threshold], + } + } +} + +impl From<&[Threshold]> for ThresholdSet { + fn from(thresholds: &[Threshold]) -> Self { + Self { + thresholds: thresholds.into(), + } + } +} diff --git a/rend3-test/tests/msaa.rs b/rend3-test/tests/msaa.rs index 6bff054b..5c5da7ff 100644 --- a/rend3-test/tests/msaa.rs +++ b/rend3-test/tests/msaa.rs @@ -1,7 +1,7 @@ use anyhow::Context; use glam::{Mat4, Vec3, Vec4}; use rend3::types::{Camera, Handedness, MeshBuilder, Object, ObjectMeshKind, SampleCount}; -use rend3_test::{no_gpu_return, test_attr, FrameRenderSettings, TestRunner}; +use rend3_test::{no_gpu_return, test_attr, FrameRenderSettings, TestRunner, Threshold}; #[test_attr] pub async fn triangle() -> anyhow::Result<()> { @@ -47,7 +47,7 @@ pub async fn triangle() -> anyhow::Result<()> { .render_and_compare( FrameRenderSettings::new().samples(SampleCount::Four), "tests/results/msaa/four.png", - 0.0, + Threshold::Mean(0.0), ) .await?; @@ -96,7 +96,7 @@ pub async fn sample_coverage() -> anyhow::Result<()> { .render_and_compare( FrameRenderSettings::new().samples(samples), &format!("tests/results/msaa/sample-coverage-{}.png", samples as u8), - 0.0, + Threshold::Mean(0.0), ) .await?; } diff --git a/rend3-test/tests/object.rs b/rend3-test/tests/object.rs new file mode 100644 index 00000000..dceabd12 --- /dev/null +++ b/rend3-test/tests/object.rs @@ -0,0 +1,61 @@ +use anyhow::Context; +use glam::{Mat4, Vec3, Vec4}; +use rend3::{ + types::{Camera, Handedness}, + util::freelist::FreelistDerivedBuffer, +}; +use rend3_test::{no_gpu_return, test_attr, FrameRenderSettings, TestRunner, Threshold}; + +/// There was a bug in the culling implementation where the per-material buffer +/// was never resized to fit the 
number of objects in the scene once it was initially +/// created. This manifested as objects above the initial frame count would get all-zero +/// transforms and be completely hidden. We reproduce those conditions here, and ensure +/// that the bug is fixed. +#[test_attr] +pub async fn multi_frame_add() -> anyhow::Result<()> { + let iad = no_gpu_return!(rend3::create_iad(None, None, None, None).await) + .context("InstanceAdapterDevice creation failed")?; + + let Ok(runner) = TestRunner::builder() + .iad(iad.clone()) + .handedness(Handedness::Left) + .build() + .await + else { + return Ok(()); + }; + + let material = runner.add_unlit_material(Vec4::ONE); + + // Make a plane whose (0, 0) is at the top left, and is 1 unit large. + let base_matrix = Mat4::from_translation(Vec3::new(0.5, 0.5, 0.0)) * Mat4::from_scale(Vec3::new(0.5, 1.0, 1.0)); + + runner.set_camera_data(Camera { + projection: rend3::types::CameraProjection::Raw(Mat4::orthographic_lh(0.0, 2.0, 16.0, 0.0, 0.0, 1.0)), + view: Mat4::IDENTITY, + }); + + // We use the starting size amount of objects for each column, ensuring that the buffer + // will need to be resized on the second column. + let count = FreelistDerivedBuffer::STARTING_SIZE; + + // 2 side by side columns made up of `count` rows + let mut planes = Vec::with_capacity(2); + for x in 0..2 { + for y in 0..count { + planes.push(runner.plane( + material.clone(), + Mat4::from_translation(Vec3::new(x as f32, y as f32, 0.0)) * base_matrix, + )); + } + runner + .render_and_compare( + FrameRenderSettings::new(), + &format!("tests/results/object/multi-frame-add-{}.png", x), + Threshold::Mean(0.0), + ) + .await?; + } + + Ok(()) +} diff --git a/rend3-test/tests/results/object/multi-frame-add-0.png b/rend3-test/tests/results/object/multi-frame-add-0.png new file mode 100644 index 0000000000000000000000000000000000000000..9abacaa79d63abd2617654bd1ab62812a6598f09 GIT binary patch literal 439 zcmeAS@N?(olHy`uVBq!ia0vp^4j|0I1|(Ny7T#lEV65|WaSW+oeEWcLf{N!P6;Gk= zRWom;0XwS?QCs zHTR#ir_le}6@UJ_J^gR4^?!fh&;Pxr{^y7Mt6%)-zx34q_lK_3e~zc?Ss9mAF`ofO O2!p4qpUXO@geCy?n+mT0 literal 0 HcmV?d00001 diff --git a/rend3-test/tests/results/object/multi-frame-add-1.png b/rend3-test/tests/results/object/multi-frame-add-1.png new file mode 100644 index 0000000000000000000000000000000000000000..fe6c076e7aca24deafa2839e35f6da65263006e8 GIT binary patch literal 435 zcmeAS@N?(olHy`uVBq!ia0vp^4j|0I1|(Ny7T#lEV65_VaSW+oeEWcLf{N!P6;Gk= zRWom;0XwS?QCs zHTU2DMW6mlPyK&?#h?FfPyd^1{ofz>^MCKD|M?;R>K6}LsXxlkRvo39IUBX^stj0B9-FM$nuiv)SaE9$GpF+!j{(- zk{!bqr^ja;CGwWgydFg*iIPW3?0$YXXaDT;**?Gfx~}ir@47p8m26KpS%aIAm_9~B zl_|unEtH(Q{Fhf{68E#N3a^<78_TS+J#VY?+;oh=e-)q#>%d8e7;(z3;Qlu=4m;$?)hInWU8+ zn3TZ1-LK|^9msrfKzP8I=DaWG`W`4Omn6lCP6;UM@2_7N*vHAeDnn`5N!%Hz8cO$F zjUhfOgMLV5-A+Du8RbW{y@4IWHo;dWVb~xB?X6>_P-SP9=T#QTNzE>=e3jj0=Z6PPJ)$)d7qj<r|KaUVr5+b{$|a|)=2z%GDqGgiX?6C)QF`*-pg{Ah(4_|0(MHf8YMHG zONtlggO=ipJ`D>~glc($>e+Lzgb|dy|8&UXBiA%gWOeUhQ*OD^iyeqB;h446&r$L4 zG54PcymDm#p%wv@Pb2LVNCQj`%FU+2IAB^Q?QXT9pEnhi9{Y zb3d#zRlN4*!-ZM7&mohXgqKK72q|i;%DUD->F;;lr{aNA?!}ihQz_y_i1+fI9uun$ zx*6&}pc(2|GHQRl=(D`<*B9ML)s=axv?-|m&4Kt}lOa0(1=U>nAouQdB69et7nx6F zPPe!*r%56Yrs~U2e>uFi`7HOTh+pkE;Its20P1CyIgGG7fkgrpf7AF+y}XynE2Q9q zGc6^~SR;a_^yB1%!$ttefo94o6iElJHEgz%WVN5Z5`nS%HX53%pIN)Y%yE({-GZ?O z&#shYrT0^>YK!vc+H49y56>5IE;~wL*+iBaexFeAR+#|S;d|YXgnYfKXjThtPh_ZFyi#VhmIil zBXmnr!q|c474ADXEd2cO_5@UY^MDlwIAOIk4MRtxWR1-sLRY%8=St$@ds?{k_L6o6Xz=@qCcy5>Gb80TNYKkA1Ok3OtTv)5hox}xJ(bE8m z_ab_Rx?ouwpKO;-O$J4NA?JyDQ{Ob29(@@6E&u+IeX}yYnr<)wC$e^UFPD4Ny&okq 
zoiPlWV2Cd+`C`5PLf22BqN!kmU-QkFZ$2%J(Cw4q9fLLm-rzi#weWnO{1~50%;ly` z2lk3f1RVqN*i_aIX?|qvtMJNi`D0(THx%*{!-%s`W36wMBvJ?fI3ix$P^b9^bd*m|M6k1yuI`*y#mtM-G`dm+86`s22$f zq8$7kgW-WxSZ6VR*b2#;W~a8f(TxdT0kpt`+t|+5F>ML6{f(O zuet?ES`y=M@Um9X5vD>i{IDe=mIrSSqmJdSlY={7KOWYyGN;9XBtowd0s^KE4xORawm^5n1OKzE z70jbH!n&fP;ze&DCs^H66Jjt<1%9VWr^-N`qzRHVaE~A|RvE4=2A6)+oHpBp2O!gN zWdKJZ=6I5@GQ1_lLR$RAI&~Ki6Lb0!p!nCfyGdrLf0q{- z3ouI~00a`9F!VAYkItYE0n|ffhHXu%ZBWv}>_^Vh^>vYzm*ezHzR6GhY&kG&+qnvL z8VsJik{CnQ39+*k)=hhY-h?RVB0Zd z8S$P+gHrh&yb`lcpNZj({#H?R88bN2T}+ z*AuNNk^EG8<==Y+66lqIDCT}pWa_TJ+vKlbF4dPF`7o6MYET${^F>I3eJkmNanAZ3 zqXTI@ziz5<922$JAmdcI)6qx(BanL|>zk{QCt&sRp99?Gq+kna;Kc|Y*l_F}m)~-} z?8+1I4V8anH}9!_Z023x0_-67<#Ta&%r*(^dIy29*T;ZNLY(AZdyB2J!Ny?5YG~NZ zny0QLXIoMb(?LLWQxLRq^<)k77) zGfnkLKoD1(yJs#!xJKpUYP#?>h*i6Po~S`YRbAQ_p_~Q2kxHv>{Mae*4ME1Oz!;d&rXNuhS_sYvMnXeYr|%QFYiIRK>qSV zb&$Y}qA0_(v?-O{X4T!-Gr}oPkn37M`{r3n+HL4jh!wsf#jQLJqT%ck}JvO zN_QXuI1w#6%B-+YCo(JezIFMub_yj&qH&FecFrM#YZXN-WF@d-jp9k&Dn=S~g8I6V zw-gkXq2e^XZ^oQQy20A90f(Xg55*u5NYRHK1;Afhz-DmiN1`Y5O`Qc7(sJbw&JGi} p6K1ny7T5?LW}Q%(sxH&bP{B1ha$3Uce-WX;$JOO9T}(T{{Xcm!!zus( literal 5966 zcmeHLeK?fq{(mquO-+q4mUd`|p4F%}D*Hrf4Le>&5wn#}5lhD$8LhWUxwaXkjJIWo zm6B>F;xv(1%Q}ekBH5zy)^BTU*_5`{%0|rj-p^3&p6mR1uIqRGerK-B#XRQu-rxKC zdAsM4d3n0hY46i845JI&_(BZRfYfCI$3`*^7clb z35oN6Z~Tu8r~XRl_x&^S5f0A3#yX6(sT@A|KaI-$5+hlMZCiaaF?Tz=*FGccsH~%} zooH_(i1tRJYA>hg>N;_hL$CeHUf`jQ*LQ=7=hIe1ShR3;1YLR@cSojY&_XOHiMrfC z6i;XB$Av975Hq+sSg+g3P8u&ba3OYrr@uYaHmemkr{&RCm#5+1VC*24qorZ5twAq` zYRCxSab%^F{y^rZQRrf^>|r-hdz;Q?cPB zOB}?;pofM$`~32>uxBE+7=x;Zxrx^)Y#h(fcIM1W0)DnrcCtAmK*zWj>(WaNvo=mh(|khc zy?(%i7NE7Opo0wZDdpmYYVuBVzEvOQPtQ&1c-xlJYhqyq>Q0>^7Gg{w}2qF zSZtv9b>H0izx!X-lMTF_`@ONxM5zXXj(~=iyws*6BR934Jk)QU@C#<=Uni=PxLvX^ z4A1;1B&<1vWzPC0D`a!WX_;`b_oAwh{)WNu<7W+u9*bsWlo|9MpzuNs!j6=ASQ-j0 z5X+Rg+e4!(?2_Xvmi;|vUDJ`FEK%{qY_T{fAKhJcRlSPoGjXU8K>85s-Ee{NIawx( z)7FZ93Ec85Dq6qh5}`Q0V*b!)V|Mofbp#hew%6^+6fdY4%Hp5>$)^01YAWF!J1Ev! 
zJ@2cRSao=K=FWcOXZ+OPPvWVNVOGr$2AI&^M#-w^=7&{$%sR z!4I*4xl3<-F@HgkEW|eOA5qE0rrP%(_>QRGJ6yUAPj-5|YOu3oY+gm0N7Z`2Rz+>@ z_I%^q@zmBYv7YKc-)E<|KiP!Wk4);abVdF9?dY>R8lu==Esa2z4i-xz2<*Ja-$$*a z6k;^ANlq-;yR+%RhtIyPiJs$p$xdtTK#BTWs&d~6an{+)?gkg8kfHctWW}S^MP96e z{EAE{7DY^cbkyKR%W&DNZ^J#Iula{dCpQ9~MG+RQb;Gu7u=jdKBO=L3mQ$QokT__e zo4BAl=ayA9czhyNIcD@%i^mkUI6OWLXE|m=AWT^CAC?$h(K~8uAPQVqRqheIoF00| zbq4GT+(}}FNX;~k!)DH|Do;BG)3)#FhlI!l#wW`8d`D?mS-C-PwomyBZSsWpqR2=qKo3mhq+AI0Cmg7N?9(?3R0CtVD<_ z|70)?cP16lwabOJ!=dO$`-3Q#)UzO@63^2KtP2(pE$HR5@7OM!Uc*bytz59evhozr z#!%WcYc!+d3fiS}`0e^DgudgM!@}5_?xWo;?_E~bC33-aH7i{sZs1!36L21%wd2Q* zFOKnO(-r?lp{AA_tiHPL&J3gSEcE##vNLa*yg`pGev0Eny8ow-2f(=0P8-mOL&0&n zS_tC|FRJqc7?u&~ISI#Xv#vLTJUxPge9vt*eY))I97tPrVzKXtwg9`xMCDdTSajIl zy5ow92`N5xUh*U%W)14q`dJhOTuADPn4UnS#o3RaQrKWr$j$oRC&34za$s#zAJw`+dhysX&U)M2dKUu40q>(9gk9kMg&g)ITue z!a-xCo`q-`cK+j|bSx1|;(}-W*!MuPqWAUsfc({D2eR z^0#5#uC8E{JHJ)v2JPXqb5NJdHz@aI@>#`BOnpbbqgk0uxcSB-6Tj+1rIX`bE=-@g zD{#Ac3iX>gmS2Za6isRJiNM0;a>|Qc=X?f3@R<(uaDMou^ixd|xm0cvxNI+>kyl_1lz^B#X1^GQ>g@WkMDi_HS74*d?!8Qw zgw2{LHtld2cz*F-z_7z%95rT~L6OD~Zz^KYVnLg<+KGwkLvwit!tl?btV%bldeHw6 z?Awgx=*^LJJ7xEH*A_I@rJ&#Gr|MEtU1MLb-wth{rt-BS63P0Qo9Zv(3~P=uqj>_2 zRe}>fuem4o z^tE>#+L!(kP&qar(MD4)`bxwj(fC5$qt5`F%F*c4Z_c*{)?XS|x zIDl_s01xddd-G!NJK3I+<`z26xe=iwlGsWv1`d?P|Mx+uKJjC;p$Blx~@jg z9@HnGMrP`$Xe6ZAG`@da!`I?-4acA$0}8HnrU(zQuCI(2uHFG+xuxq4bZ@`;Z13q2 zGqpmTf$R$GgmWOY`SDCu$1R(CNpX#z`UGY9+s#HZ<>(f@wce_TO=~i z_Ck+fB`Krwjw_NX>XiO4z>}hvSbbe@CSY8U07@olgcLZR2uWgS)cv9l37iI7-)hO1 zA%UYEuXot$<*wLm>Ka=+dxp`g_9@v&H9*Tm%Q#A}H$+OqaBEJ%p$;yEX|@!R7kP$+ z3cLS`p^zTHiK?$YBiUZ9fyeI9z%P#+Qr?mGTaYMHFgDflnHX3%;LHMAB2p>G=7L|S zmzisy`IHu%;-T5`S|dwz9tAV~I%oW~D=E1@*#ijhA~e)oF^}QnMiwLF7(+^DTBXv# znSTbRJ0#=oXSv$W3Yxsv!>v<0i)WA-JLsIZeFUm5OGODW#jYR-W55Isqu#?LqnDUnj|}(m;sKV z-V7&@PibJkz@gtN?8>)V`dyAxGoREAzmDzZyf7iTgKP@}|J`ew=7cti#7r7HOj9;a z?!H8A4X23@tTe)YVyt)q36yYe7z}cJ9!RD!@AX#8=`AAJ&SjZB8?d2i6*5RN!5ooF zflOIxbQ&U)9_n3BR=O=33i~_icT$fuuX8+WTA6>dZK)6lw8k}e>GJu0svj>xci`kG zwJ@Y;gen<9pvzeXQxQaw0!cwx1FotP34HsRI49iUtj}$3@jUSo+^uCYF)F*K{79Q! zO^lU#3#ky&`oXCePsh$5Lnv1Owg+a%@R^taXD0P3{kTpuT{yHsixDx8wNc#khmV?s zec7CHP19lw@ebYO_Kk|C zod*e91c{)DUR8;jfzViQ<{r&(($7|&B9oyLzWAlT+&yp3(qH3&A{du~y~%ska>)C4 zc!PXXYI6Y7AS{n@oDOXD3imT^aX7I%-Mgk>Cm~X+?A(Vozg&^Nw6AIS6hjmFszh&% zHP9mR(zGCMRm*|TP-4jx;3Ul8JIb-Uh~AbYCc|lkyG<*f*!9)A!FN=V$xwl4i|KPX z)1XM91SB-;2&TN??Wq8OBEu&RoPt6EJrxV>nWXDFnv}w^@X5eafBUs(OiL5imXMWLN&3v<8w+4#Iexi>1JR5qm@cO#KnNkYGq0vb>? 
zF^hr$wfBykLr6KZKqeKJ^g7BO@g)Y~R%DR4y@mKKo^Y_zIKzuRlIFK1#-NOPhJqtI z9S|Fob;rWGq0W9~kvCPqA>G#+9p&7xZ_JvbP{t!+PQ!Nndc|-`C>7 z7aGa^OEL2shnel~+2_Yp-QO!4SdY#y?E-rVFv%7@%&X69-l$&l|L=L86CCdU`vmiU e&EwtQUE|jnoXU@!`8_-*#{|xve5F%B{67IOfms9q diff --git a/rend3-test/tests/results/simple/empty.png b/rend3-test/tests/results/simple/empty.png new file mode 100644 index 0000000000000000000000000000000000000000..dfac4a664f2ba7cb7161734ce7e528a42f2daaf0 GIT binary patch literal 434 zcmeAS@N?(olHy`uVBq!ia0vp^4j|0I1|(Ny7T#lEV660XaSW+oeEWcLf{N!P6;Gk= zRWom;0XwS?QDX z^uM{*|NVhK|M#BypC9tCe(|UO(o_H6U-9R^+fbG|X3z9!0V88cM^rX2E*Lyr{an^L HB{Ts5<$4H# literal 0 HcmV?d00001 diff --git a/rend3-test/tests/root.rs b/rend3-test/tests/root.rs index 45080131..ab1b6f0f 100644 --- a/rend3-test/tests/root.rs +++ b/rend3-test/tests/root.rs @@ -1,3 +1,4 @@ mod msaa; +mod object; mod shadow; mod simple; diff --git a/rend3-test/tests/shadow.rs b/rend3-test/tests/shadow.rs index ed717418..36ab0943 100644 --- a/rend3-test/tests/shadow.rs +++ b/rend3-test/tests/shadow.rs @@ -3,7 +3,7 @@ use std::f32::consts::FRAC_PI_2; use anyhow::Context; use glam::{Mat4, Quat, Vec3, Vec3A, Vec4}; use rend3::types::{Camera, Handedness}; -use rend3_test::{no_gpu_return, test_attr, FrameRenderSettings, TestRunner}; +use rend3_test::{no_gpu_return, test_attr, FrameRenderSettings, TestRunner, Threshold}; #[test_attr] pub async fn shadows() -> anyhow::Result<()> { @@ -34,7 +34,14 @@ pub async fn shadows() -> anyhow::Result<()> { let file_name = "tests/results/shadow/plane.png"; runner - .render_and_compare(FrameRenderSettings::new().size(256)?, file_name, 0.02) + .render_and_compare( + FrameRenderSettings::new().size(256)?, + file_name, + Threshold::Percentile { + percentile: 0.5, + threshold: 0.04, + }, + ) .await?; let material2 = runner.add_lit_material(Vec4::new(0.75, 0.5, 0.25, 1.0)); @@ -46,7 +53,14 @@ pub async fn shadows() -> anyhow::Result<()> { let file_name = "tests/results/shadow/cube.png"; runner - .render_and_compare(FrameRenderSettings::new().size(256)?, file_name, 0.02) + .render_and_compare( + FrameRenderSettings::new().size(256)?, + file_name, + Threshold::Percentile { + percentile: 0.5, + threshold: 0.04, + }, + ) .await?; Ok(()) diff --git a/rend3-test/tests/simple.rs b/rend3-test/tests/simple.rs index 65e28753..684c69ce 100644 --- a/rend3-test/tests/simple.rs +++ b/rend3-test/tests/simple.rs @@ -1,9 +1,35 @@ use anyhow::Context; use glam::{Mat4, Vec3, Vec4}; use rend3::types::{Camera, Handedness, MeshBuilder, Object, ObjectMeshKind}; -use rend3_test::{no_gpu_return, test_attr, FrameRenderSettings, TestRunner}; +use rend3_test::{no_gpu_return, test_attr, FrameRenderSettings, TestRunner, Threshold}; use wgpu::FrontFace; +#[test_attr] +pub async fn empty() -> anyhow::Result<()> { + let iad = no_gpu_return!(rend3::create_iad(None, None, None, None).await) + .context("InstanceAdapterDevice creation failed")?; + + let Ok(runner) = TestRunner::builder().iad(iad).build().await else { + return Ok(()); + }; + + runner.set_camera_data(Camera { + projection: rend3::types::CameraProjection::Raw(Mat4::IDENTITY), + view: Mat4::IDENTITY, + }); + + runner + .render_and_compare( + FrameRenderSettings::new(), + "tests/results/simple/empty.png", + Threshold::Mean(0.0), + ) + .await + .context("Image Comparison Failed")?; + + Ok(()) +} + #[test_attr] pub async fn triangle() -> anyhow::Result<()> { let tests = [ @@ -67,8 +93,13 @@ pub async fn triangle() -> anyhow::Result<()> { false => "tests/results/simple/triangle-backface.png", }; runner - 
.render_and_compare(FrameRenderSettings::new(), file_name, 0.0) - .await?; + .render_and_compare(FrameRenderSettings::new(), file_name, Threshold::Mean(0.0)) + .await + .with_context(|| { + format!( + "Comparison failed on test (Handedness::{handedness:?}, FrontFace::{winding:?}, Visible: {visible})" + ) + })?; } Ok(()) @@ -135,8 +166,9 @@ pub async fn coordinate_space() -> anyhow::Result<()> { let file_name = format!("tests/results/simple/coordinate-space-{name}.png"); runner - .render_and_compare(FrameRenderSettings::new(), file_name, 0.0) - .await?; + .render_and_compare(FrameRenderSettings::new(), file_name, Threshold::Mean(0.0)) + .await + .with_context(|| format!("Comparison failed on test {name}"))?; } Ok(()) diff --git a/rend3-types/src/lib.rs b/rend3-types/src/lib.rs index 24694919..6397043e 100644 --- a/rend3-types/src/lib.rs +++ b/rend3-types/src/lib.rs @@ -194,6 +194,18 @@ pub type GraphDataHandleUntyped = ResourceHandle; /// Refcounted handle to an instance of GraphData pub struct GraphDataHandle(pub GraphDataHandleUntyped, pub PhantomData); +impl Debug for GraphDataHandle { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +impl Clone for GraphDataHandle { + fn clone(&self) -> Self { + Self(self.0.clone(), PhantomData) + } +} + /// Internal non-owning handle to a Mesh pub type RawMeshHandle = RawResourceHandle; /// Internal non-owning handle to a Texture2D @@ -252,7 +264,9 @@ pub use wgt::{ /// /// The value allows for 8 bits of information packed in the high 8 bits of the /// index for object recombination. -pub const MAX_VERTEX_COUNT: u32 = 1 << 24; +/// +/// We leave exactly one value at the top for the "invalid vertex" value: 0x00_FF_FF_FF; +pub const MAX_VERTEX_COUNT: u32 = (1 << 24) - 1; /// The maximum amount of indices any one object can have. pub const MAX_INDEX_COUNT: u32 = u32::MAX; @@ -1172,8 +1186,8 @@ impl SampleCount { pub const ARRAY: [Self; 2] = [Self::One, Self::Four]; /// Determines if a resolve texture is needed for this texture. 
- pub fn needs_resolve(self) -> bool { - self != Self::One + pub const fn needs_resolve(self) -> bool { + !matches!(self, Self::One) } } @@ -1193,6 +1207,15 @@ pub enum Handedness { Right, } +impl From for wgt::FrontFace { + fn from(value: Handedness) -> Self { + match value { + Handedness::Left => Self::Cw, + Handedness::Right => Self::Ccw, + } + } +} + impl Default for Handedness { fn default() -> Self { Self::Left diff --git a/rend3/shaders/vertex_attributes.wgsl b/rend3/shaders/vertex_attributes.wgsl index 663d1f94..baa5bf52 100644 --- a/rend3/shaders/vertex_attributes.wgsl +++ b/rend3/shaders/vertex_attributes.wgsl @@ -1,8 +1,43 @@ // -- DO NOT VALIDATE -- struct Indices { - vertex: u32, object: u32, + vertex: u32, +} + +struct BatchIndices { + /// Index _within_ the batch + local_object: u32, + /// Vertex index within the object + vertex: u32, +} + +const INVALID_VERTEX: u32 = 0x00FFFFFFu; + +fn unpack_batch_index(vertex_index: u32) -> BatchIndices { + return BatchIndices( + vertex_index >> 24u, + vertex_index & 0xFFFFFFu, + ); +} + +fn pack_batch_index(local_object: u32, index: u32) -> u32 { + return (local_object << 24u) | (index & 0xFFFFFFu); +} + +alias TriangleVertices = array; +alias TriangleIndices = array; +struct Triangle { + vertices: TriangleVertices, + indices: TriangleIndices, +} + +fn pack_batch_indices(local_object: u32, indices: TriangleIndices) -> TriangleIndices { + return TriangleIndices( + pack_batch_index(local_object, indices[0]), + pack_batch_index(local_object, indices[1]), + pack_batch_index(local_object, indices[2]), + ); } fn extract_attribute_vec2_f32(byte_base_offset: u32, vertex_index: u32) -> vec2 { diff --git a/rend3/src/graph/data_handle.rs b/rend3/src/graph/data_handle.rs deleted file mode 100644 index 85ed170d..00000000 --- a/rend3/src/graph/data_handle.rs +++ /dev/null @@ -1,37 +0,0 @@ -use std::{ - ops::Deref, - sync::atomic::{AtomicUsize, Ordering}, -}; - -use once_cell::sync::Lazy; - -static DATA_HANDLE_INDEX_ALLOCATOR: AtomicUsize = AtomicUsize::new(0); - -pub struct RenderGraphConnection { - name: &'static str, - index: Lazy, -} -impl RenderGraphConnection { - pub const fn new(name: &'static str) -> Self { - Self { - name, - index: Lazy::new(|| DATA_HANDLE_INDEX_ALLOCATOR.fetch_add(1, Ordering::Relaxed)), - } - } - - pub const fn name(&self) -> &'static str { - self.name - } - - pub fn index(&self) -> usize { - *self.index - } -} - -impl Deref for RenderGraphConnection { - type Target = usize; - - fn deref(&self) -> &Self::Target { - &self.index - } -} diff --git a/rend3/src/graph/graph.rs b/rend3/src/graph/graph.rs index c5d2eb6d..6159b83d 100644 --- a/rend3/src/graph/graph.rs +++ b/rend3/src/graph/graph.rs @@ -112,6 +112,8 @@ impl<'node> RenderGraph<'node> { idx, layer_start: 0, layer_end: desc.depth, + mip_start: 0, + mip_end: desc.to_core().mip_count(), viewport: ViewportRect { offset: UVec2::ZERO, size: desc.resolution, @@ -126,6 +128,7 @@ impl<'node> RenderGraph<'node> { &mut self, texture: &'node dyn AsTextureReference, layers: Range, + mips: Range, viewport: ViewportRect, ) -> RenderTargetHandle { let idx = self.imported_targets.len(); @@ -135,6 +138,8 @@ impl<'node> RenderGraph<'node> { idx, layer_start: layers.start, layer_end: layers.end, + mip_start: mips.start, + mip_end: mips.end, viewport, }), } @@ -341,6 +346,8 @@ impl<'node> RenderGraph<'node> { let view = active_textures[®ion.idx].create_view(&TextureViewDescriptor { base_array_layer: region.layer_start, array_layer_count: Some(region.layer_end - region.layer_start), + 
base_mip_level: region.mip_start as u32, + mip_level_count: Some((region.mip_end - region.mip_start) as u32), ..TextureViewDescriptor::default() }); vacant.insert(view); @@ -354,6 +361,8 @@ impl<'node> RenderGraph<'node> { .create_view(&TextureViewDescriptor { base_array_layer: region.layer_start, array_layer_count: Some(region.layer_end - region.layer_start), + base_mip_level: region.mip_start as u32, + mip_level_count: Some((region.mip_end - region.mip_start) as u32), ..TextureViewDescriptor::default() }); vacant.insert(view); diff --git a/rend3/src/graph/mod.rs b/rend3/src/graph/mod.rs index 4d913eb5..cd85fa92 100644 --- a/rend3/src/graph/mod.rs +++ b/rend3/src/graph/mod.rs @@ -47,11 +47,10 @@ use std::ops::Range; use glam::UVec2; use rend3_types::{SampleCount, TextureFormat, TextureUsages}; -use wgpu::{Color, TextureView}; +use wgpu::{Color, Extent3d, TextureDimension, TextureView}; use crate::util::typedefs::SsoString; -mod data_handle; mod encpass; #[allow(clippy::module_inception)] // lmao mod graph; @@ -60,7 +59,6 @@ mod store; mod temp; mod texture_store; -pub use data_handle::*; pub use encpass::*; pub use graph::*; pub use node::*; @@ -75,6 +73,8 @@ pub struct RenderTargetDescriptor { pub resolution: UVec2, pub depth: u32, pub samples: SampleCount, + // None means maximum mip count + pub mip_levels: Option, pub format: TextureFormat, pub usage: TextureUsages, } @@ -84,6 +84,7 @@ impl RenderTargetDescriptor { resolution: self.resolution, depth: self.depth, samples: self.samples, + mip_levels: self.mip_levels, format: self.format, usage: self.usage, } @@ -95,10 +96,26 @@ pub(crate) struct RenderTargetCore { pub resolution: UVec2, pub depth: u32, pub samples: SampleCount, + pub mip_levels: Option, pub format: TextureFormat, pub usage: TextureUsages, } +impl RenderTargetCore { + fn mip_count(&self) -> u8 { + match self.mip_levels { + Some(count) => count, + None => Extent3d { + width: self.resolution.x, + height: self.resolution.y, + // D2 doesn't care about depth + depth_or_array_layers: 1, + } + .max_mips(TextureDimension::D2) as u8, + } + } +} + /// Requirements to render to a particular shadow map. /// /// view + size form the start/end of the viewport to render to. 
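For the `mip_levels: Option<u8>` field added to RenderTargetDescriptor above: `None` means "allocate the full mip chain", whose length `mip_count()` takes from `Extent3d::max_mips`. A minimal sketch of that arithmetic, assuming `max_mips` for a 2D texture evaluates to floor(log2(max(width, height))) + 1; the helper name below is made up for illustration and is not part of this patch.

// Hypothetical stand-in for what RenderTargetCore::mip_count() asks wgpu for:
// the number of mip levels needed to reduce the largest dimension down to 1.
fn full_mip_count(width: u32, height: u32) -> u32 {
    32 - width.max(height).leading_zeros()
}

fn main() {
    assert_eq!(full_mip_count(1920, 1080), 11); // 1920 -> 960 -> ... -> 1, eleven levels
    assert_eq!(full_mip_count(256, 256), 9);    // power-of-two sizes: log2(256) + 1
}

This is the count the hi-z node chain earlier in this patch iterates over when it builds each `HiZ Mip {src} -> {dst}` pass.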
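Stepping back to the `pack_batch_index` / `unpack_batch_index` helpers added to vertex_attributes.wgsl earlier in this patch: a vertex index now carries the batch-local object id in its high 8 bits and the vertex index proper in the low 24 bits, with 0x00FF_FFFF reserved as the invalid-vertex sentinel (which is why MAX_VERTEX_COUNT drops to (1 << 24) - 1). A small Rust restatement of the same bit layout, for illustration only:

const INVALID_VERTEX: u32 = 0x00FF_FFFF;

// Mirror of the WGSL pack_batch_index: 8-bit local object id in the high bits,
// 24-bit vertex index in the low bits.
fn pack_batch_index(local_object: u32, vertex: u32) -> u32 {
    (local_object << 24) | (vertex & 0x00FF_FFFF)
}

// Mirror of the WGSL unpack_batch_index: recover (local_object, vertex).
fn unpack_batch_index(packed: u32) -> (u32, u32) {
    (packed >> 24, packed & 0x00FF_FFFF)
}

fn main() {
    let packed = pack_batch_index(3, 41_256);
    assert_eq!(unpack_batch_index(packed), (3, 41_256));
    // The all-ones low field is the reserved invalid-vertex value.
    assert_eq!(unpack_batch_index(INVALID_VERTEX), (0, INVALID_VERTEX));
}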
@@ -124,6 +141,8 @@ pub(super) struct TextureRegion { idx: usize, layer_start: u32, layer_end: u32, + mip_start: u8, + mip_end: u8, viewport: ViewportRect, } @@ -174,7 +193,11 @@ impl RenderTargetHandle { let left = self.to_region(); let right = other.to_region(); - left.idx == right.idx && left.layer_start == right.layer_start && left.layer_end == right.layer_end + left.idx == right.idx + && left.layer_start == right.layer_start + && left.layer_end == right.layer_end + && left.mip_start == right.mip_start + && left.mip_end == right.mip_end } pub(super) fn to_region(self) -> TextureRegion { @@ -184,11 +207,31 @@ impl RenderTargetHandle { } } - pub fn restrict(mut self, layers: Range, viewport: ViewportRect) -> Self { + pub fn set_layers(mut self, layers: Range) -> Self { match &mut self.resource { GraphSubResource::ImportedTexture(region) | GraphSubResource::Texture(region) => { region.layer_start = layers.start; region.layer_end = layers.end; + } + _ => unreachable!(), + } + self + } + + pub fn set_mips(mut self, mips: Range) -> Self { + match &mut self.resource { + GraphSubResource::ImportedTexture(region) | GraphSubResource::Texture(region) => { + region.mip_start = mips.start; + region.mip_end = mips.end; + } + _ => unreachable!(), + } + self + } + + pub fn set_viewport(mut self, viewport: ViewportRect) -> Self { + match &mut self.resource { + GraphSubResource::ImportedTexture(region) | GraphSubResource::Texture(region) => { region.viewport = viewport; } _ => unreachable!(), diff --git a/rend3/src/graph/node.rs b/rend3/src/graph/node.rs index 4ade24e6..c1f29ea0 100644 --- a/rend3/src/graph/node.rs +++ b/rend3/src/graph/node.rs @@ -136,6 +136,21 @@ impl<'a, 'node> RenderGraphNodeBuilder<'a, 'node> { DeclaredDependency { handle } } + /// Sugar over [add_data] which makes it easy to + /// declare optional textures. 
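+ /// Passing `None` simply returns `None` without declaring any dependency.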
+ /// + /// [add_data]: RenderGraphNodeBuilder::add_data + pub fn add_optional_data( + &mut self, + handle: Option>, + usage: NodeResourceUsage, + ) -> Option>> + where + T: 'static, + { + Some(self.add_data(handle?, usage)) + } + /// Declares a data handle as having the given render targets pub fn add_dependencies_to_render_targets( &mut self, diff --git a/rend3/src/graph/texture_store.rs b/rend3/src/graph/texture_store.rs index 8822ddc5..07981fca 100644 --- a/rend3/src/graph/texture_store.rs +++ b/rend3/src/graph/texture_store.rs @@ -37,7 +37,7 @@ impl GraphTextureStore { height: desc.resolution.y, depth_or_array_layers: 1, }, - mip_level_count: 1, + mip_level_count: desc.mip_count() as u32, sample_count: desc.samples as _, dimension: TextureDimension::D2, format: desc.format, diff --git a/rend3/src/managers/graph_storage.rs b/rend3/src/managers/graph_storage.rs index 0e3888e0..c96d948b 100644 --- a/rend3/src/managers/graph_storage.rs +++ b/rend3/src/managers/graph_storage.rs @@ -1,11 +1,14 @@ -use std::{any::Any, ops::DerefMut}; +use std::{ + any::Any, + ops::{Deref, DerefMut}, +}; -use parking_lot::Mutex; +use parking_lot::RwLock; use rend3_types::{GraphDataHandle, RawGraphDataHandleUntyped, WasmNotSend}; #[derive(Default)] pub struct GraphStorage { - // Type under any is Mutex + // Type under any is RwLock #[cfg(not(target_arch = "wasm32"))] data: Vec>>, #[cfg(target_arch = "wasm32")] @@ -21,14 +24,21 @@ impl GraphStorage { if handle.idx >= self.data.len() { self.data.resize_with(handle.idx + 1, || None); } - self.data[handle.idx] = Some(Box::new(Mutex::new(data))); + self.data[handle.idx] = Some(Box::new(RwLock::new(data))); + } + + pub fn get(&self, handle: &GraphDataHandle) -> impl Deref + '_ { + let rw_lock: &RwLock = self.data[handle.0.idx].as_ref().unwrap().downcast_ref().unwrap(); + rw_lock + .try_read() + .expect("Called get on the same handle that was already borrowed mutably within a renderpass") } pub fn get_mut(&self, handle: &GraphDataHandle) -> impl DerefMut + '_ { - let mutex: &Mutex = self.data[handle.0.idx].as_ref().unwrap().downcast_ref().unwrap(); - mutex - .try_lock() - .expect("Tried to call get_mut on the same handle twice") + let rw_lock: &RwLock = self.data[handle.0.idx].as_ref().unwrap().downcast_ref().unwrap(); + rw_lock + .try_write() + .expect("Tried to call get_mut on the same handle twice within a renderpass") } pub fn remove(&mut self, handle: &RawGraphDataHandleUntyped) { diff --git a/rend3/src/managers/handle_alloc.rs b/rend3/src/managers/handle_alloc.rs index c22d8180..20dba352 100644 --- a/rend3/src/managers/handle_alloc.rs +++ b/rend3/src/managers/handle_alloc.rs @@ -18,6 +18,15 @@ where { max_allocated: AtomicUsize, freelist: Mutex>, + /// We want the render routines to be able to rely on deleted handles being valid for at + /// least one frame. + /// + /// To facilitate this, we first put the handle in the delay list, then at the top of + /// every frame, we move the handles from the delay list to the freelist. + /// + /// We do not need to do this for everything though, only for Object handles, as these + /// are the root handle which the renderer accesses everything. 
+ delay_list: Option>>, _phantom: PhantomData, } @@ -25,10 +34,11 @@ impl HandleAllocator where RawResourceHandle: DeletableRawResourceHandle, { - pub fn new() -> Self { + pub fn new(delay_handle_reclaimation: bool) -> Self { Self { max_allocated: AtomicUsize::new(0), freelist: Mutex::new(Vec::new()), + delay_list: delay_handle_reclaimation.then(|| Mutex::new(Vec::new())), _phantom: PhantomData, } } @@ -49,15 +59,24 @@ where pub fn deallocate(&self, handle: RawResourceHandle) { let idx = handle.idx; - self.freelist.lock().push(idx); + if let Some(ref delay_list) = self.delay_list { + delay_list.lock().push(idx); + } else { + self.freelist.lock().push(idx); + } } -} -impl Default for HandleAllocator -where - RawResourceHandle: DeletableRawResourceHandle, -{ - fn default() -> Self { - Self::new() + pub fn reclaim_delayed_handles(&self) -> Vec> { + if let Some(ref delay_list) = self.delay_list { + let mut locked_delay_list = delay_list.lock(); + + self.freelist.lock().extend_from_slice(&locked_delay_list); + locked_delay_list + .drain(..) + .map(|idx| RawResourceHandle::new(idx)) + .collect() + } else { + Vec::new() + } } } diff --git a/rend3/src/managers/object.rs b/rend3/src/managers/object.rs index c8197cd1..b2bbb345 100644 --- a/rend3/src/managers/object.rs +++ b/rend3/src/managers/object.rs @@ -31,6 +31,8 @@ pub struct ShaderObject { pub material_index: u32, pub vertex_attribute_start_offsets: >::U32Array, + // 1 if enabled, 0 if disabled + pub enabled: u32, } impl Default for ShaderObject { @@ -42,6 +44,7 @@ impl Default for ShaderObject { index_count: Default::default(), material_index: Default::default(), vertex_attribute_start_offsets: Zeroable::zeroed(), + enabled: Default::default(), } } } @@ -87,8 +90,8 @@ struct ObjectArchetype { buffer: FreelistDerivedBuffer, set_object_transform: fn(&mut WasmVecAny, &mut FreelistDerivedBuffer, usize, Mat4), duplicate_object: fn(&WasmVecAny, usize, ObjectChange) -> Object, - remove: fn(&mut WasmVecAny, usize), - evaluate: fn(&mut ObjectArchetype, &Device, &mut CommandEncoder, &ScatterCopy), + remove: fn(&mut ObjectArchetype, usize), + evaluate: fn(&mut ObjectArchetype, &Device, &mut CommandEncoder, &ScatterCopy, &[RawObjectHandle]), } /// Manages objects. That's it. 
¯\\\_(ツ)\_/¯ @@ -167,14 +170,18 @@ impl ObjectManager { let archetype = self.archetype.get_mut(&type_id).unwrap(); - (archetype.remove)(&mut archetype.data_vec, handle.idx); - - archetype.object_count -= 1; + (archetype.remove)(archetype, handle.idx); } - pub fn evaluate(&mut self, device: &Device, encoder: &mut CommandEncoder, scatter: &ScatterCopy) { + pub fn evaluate( + &mut self, + device: &Device, + encoder: &mut CommandEncoder, + scatter: &ScatterCopy, + deferred_removals: &[RawObjectHandle], + ) { for archetype in self.archetype.values_mut() { - (archetype.evaluate)(archetype, device, encoder, scatter); + (archetype.evaluate)(archetype, device, encoder, scatter, deferred_removals); } } @@ -298,6 +305,7 @@ pub(super) fn object_add_callback(_material: &M, args: ObjectAddCal first_index: (index_range.start / 4) as u32, index_count: ((index_range.end - index_range.start) / 4) as u32, vertex_attribute_start_offsets, + enabled: true as u32, }, material_handle: args.object.material, mesh_kind: args.object.mesh_kind, @@ -345,10 +353,21 @@ fn duplicate_object(data: &WasmVecAny, idx: usize, change: ObjectCh } } -fn remove(data: &mut WasmVecAny, idx: usize) { - let data_vec = data.downcast_slice_mut::>>().unwrap(); +fn remove(archetype: &mut ObjectArchetype, idx: usize) { + let data_vec = archetype + .data_vec + .downcast_slice_mut::>>() + .unwrap(); - data_vec[idx] = None; + // We don't actually remove the object at this point, + // we just mark it as disabled. Next frame, this handle + // will be provided in `deferred_removals` in `evaluate` + // so we can actually delete it. + // + // We defer objects one frame so that temporal culling + // has valid data. + archetype.buffer.use_index(idx); + data_vec[idx].as_mut().unwrap().inner.enabled = false as u32; } fn evaluate( @@ -356,12 +375,23 @@ fn evaluate( device: &Device, encoder: &mut CommandEncoder, scatter: &ScatterCopy, + deferred_removals: &[RawObjectHandle], ) { let data_vec = archetype .data_vec - .downcast_slice::>>() + .downcast_slice_mut::>>() .unwrap(); + for removal in deferred_removals { + // Only one archetype will have each handle, + // so if we have it, we can be sure it's ours. 
+ let removed_obj = data_vec[removal.idx].take(); + + if removed_obj.is_some() { + archetype.object_count -= 1; + } + } + archetype.buffer.apply(device, encoder, scatter, |idx| { data_vec[idx].as_ref().map(|o| o.inner).unwrap_or_default() }) diff --git a/rend3/src/renderer/eval.rs b/rend3/src/renderer/eval.rs index 18c68d10..aca3a623 100644 --- a/rend3/src/renderer/eval.rs +++ b/rend3/src/renderer/eval.rs @@ -11,6 +11,8 @@ pub fn evaluate_instructions(renderer: &Renderer) -> InstructionEvaluationOutput let mut instructions = renderer.instructions.consumer.lock(); + let delayed_object_handles = renderer.resource_handle_allocators.object.reclaim_delayed_handles(); + // 16 encoders is a reasonable default let mut cmd_bufs = Vec::with_capacity(16); @@ -171,9 +173,12 @@ pub fn evaluate_instructions(renderer: &Renderer) -> InstructionEvaluationOutput // Do these in dependency order // Level 3 - data_core - .object_manager - .evaluate(&renderer.device, &mut encoder, &renderer.scatter); + data_core.object_manager.evaluate( + &renderer.device, + &mut encoder, + &renderer.scatter, + &delayed_object_handles, + ); // Level 2 let d2_texture = data_core.d2_texture_manager.evaluate(&renderer.device); diff --git a/rend3/src/renderer/mod.rs b/rend3/src/renderer/mod.rs index d721bf21..68a9115a 100644 --- a/rend3/src/renderer/mod.rs +++ b/rend3/src/renderer/mod.rs @@ -65,7 +65,6 @@ pub struct Renderer { } /// Handle allocators -#[derive(Default)] struct HandleAllocators { pub mesh: HandleAllocator, pub skeleton: HandleAllocator, @@ -77,6 +76,21 @@ struct HandleAllocators { pub graph_storage: HandleAllocator, } +impl Default for HandleAllocators { + fn default() -> Self { + Self { + mesh: HandleAllocator::new(false), + skeleton: HandleAllocator::new(false), + d2_texture: HandleAllocator::new(false), + d2c_texture: HandleAllocator::new(false), + material: HandleAllocator::new(false), + object: HandleAllocator::new(true), + directional_light: HandleAllocator::new(false), + graph_storage: HandleAllocator::new(false), + } + } +} + /// All the mutex protected data within the renderer pub struct RendererDataCore { /// Position and settings of the camera. 
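Pulling the last few hunks together: only the object allocator is built with `HandleAllocator::new(true)`, so deleting an object parks its handle in the delay list, the object itself is merely flagged `enabled = 0`, and at the top of the next frame `reclaim_delayed_handles` both returns the indices to the freelist and hands them to `ObjectManager::evaluate` as `deferred_removals` so the slots can finally be cleared. This keeps deleted handles valid for one extra frame, which the temporal culling relies on. A stripped-down sketch of that allocator behaviour, with rend3's generic handle types replaced by plain indices (illustration only, not the actual implementation):

// Toy allocator: freed indices wait in `delay_list` for one frame before reuse.
struct DelayedAllocator {
    next: usize,
    freelist: Vec<usize>,
    delay_list: Vec<usize>,
}

impl DelayedAllocator {
    fn allocate(&mut self) -> usize {
        if let Some(idx) = self.freelist.pop() {
            idx
        } else {
            let idx = self.next;
            self.next += 1;
            idx
        }
    }

    // Deallocation only parks the index; the handle stays usable this frame.
    fn deallocate(&mut self, idx: usize) {
        self.delay_list.push(idx);
    }

    // Called once at the top of a frame: the parked indices become reusable and
    // are returned so dependent data (the disabled objects) can be cleaned up.
    fn reclaim_delayed(&mut self) -> Vec<usize> {
        self.freelist.extend_from_slice(&self.delay_list);
        self.delay_list.drain(..).collect()
    }
}

fn main() {
    let mut alloc = DelayedAllocator { next: 0, freelist: Vec::new(), delay_list: Vec::new() };
    let h = alloc.allocate();
    alloc.deallocate(h);
    assert_eq!(alloc.reclaim_delayed(), vec![h]); // next frame: handle comes back for cleanup
    assert_eq!(alloc.allocate(), h);              // and the index is reusable again
}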
diff --git a/rend3/src/shader.rs b/rend3/src/shader.rs index 279f35c5..829f5016 100644 --- a/rend3/src/shader.rs +++ b/rend3/src/shader.rs @@ -20,6 +20,7 @@ struct Rend3ShaderSources; #[derive(Debug, Default, Serialize)] pub struct ShaderConfig { pub profile: Option, + pub position_attribute_offset: usize, } pub struct ShaderVertexBufferConfig { @@ -263,11 +264,10 @@ impl<'a> ShaderVertexBufferHelper<'a> { let unpack_function = format!( " fn unpack_vertex_index(vertex_index: u32) -> Indices {{ - let local_object_id = vertex_index >> 24u; - let vertex_id = vertex_index & 0xFFFFFFu; - let object_id = {batch_buffer}.ranges[local_object_id].object_id; + let batch_indices = unpack_batch_index(vertex_index); + let object_id = {batch_buffer}.object_culling_information[batch_indices.local_object].object_id; - return Indices(vertex_id, object_id); + return Indices(object_id, batch_indices.vertex); }}" ); @@ -354,7 +354,10 @@ mod tests { let mut pp = ShaderPreProcessor::new(); pp.add_shader("simple", "{{include \"other\"}} simple"); pp.add_shader("other", "other"); - let config = ShaderConfig { profile: None }; + let config = ShaderConfig { + profile: None, + position_attribute_offset: 0, + }; let output = pp.render_shader("simple", &config, None).unwrap(); assert_eq!(output, "other simple"); @@ -365,7 +368,10 @@ mod tests { let mut pp = ShaderPreProcessor::new(); pp.add_shader("simple", "{{include \"other\"}} simple"); pp.add_shader("other", "{{include \"simple\"}} other"); - let config = ShaderConfig { profile: None }; + let config = ShaderConfig { + profile: None, + position_attribute_offset: 0, + }; let output = pp.render_shader("simple", &config, None).unwrap(); assert_eq!(output, " other simple"); @@ -375,7 +381,10 @@ mod tests { fn error_include() { let mut pp = ShaderPreProcessor::new(); pp.add_shader("simple", "{{include \"other\"}} simple"); - let config = ShaderConfig { profile: None }; + let config = ShaderConfig { + profile: None, + position_attribute_offset: 0, + }; let output = pp.render_shader("simple", &config, None); assert!(output.is_err(), "Expected error, got {output:?}"); @@ -385,7 +394,10 @@ mod tests { fn no_arg_include() { let mut pp = ShaderPreProcessor::new(); pp.add_shader("simple", "{{include}} simple"); - let config = ShaderConfig { profile: None }; + let config = ShaderConfig { + profile: None, + position_attribute_offset: 0, + }; let output = pp.render_shader("simple", &config, None); assert!(output.is_err(), "Expected error, got {output:?}"); diff --git a/rend3/src/util/bind_merge.rs b/rend3/src/util/bind_merge.rs index 3258c2ad..97f1fd7f 100644 --- a/rend3/src/util/bind_merge.rs +++ b/rend3/src/util/bind_merge.rs @@ -96,6 +96,15 @@ impl<'a> BindGroupBuilder<'a> { self } + pub fn append_buffer_with_offset_and_size(&mut self, buffer: &'a Buffer, offset: u64, size: u64) -> &mut Self { + self.append(BindingResource::Buffer(BufferBinding { + buffer, + offset, + size: NonZeroU64::new(size), + })); + self + } + pub fn append_sampler(&mut self, sampler: &'a Sampler) -> &mut Self { self.append(BindingResource::Sampler(sampler)); self diff --git a/rend3/src/util/buffer.rs b/rend3/src/util/buffer.rs index bee9a28f..6613880b 100644 --- a/rend3/src/util/buffer.rs +++ b/rend3/src/util/buffer.rs @@ -1,6 +1,6 @@ //! Automatic management of Power-of-Two sized buffers. 
-use std::{marker::PhantomData, ops::Deref}; +use std::{marker::PhantomData, ops::Deref, sync::Arc}; use encase::{private::WriteInto, ShaderType}; use wgpu::{Buffer, BufferAddress, BufferDescriptor, BufferUsages, Device, Queue}; @@ -8,8 +8,9 @@ use wgpu::{Buffer, BufferAddress, BufferDescriptor, BufferUsages, Device, Queue} use crate::util::typedefs::SsoString; /// Creates, fills, and automatically resizes a power-of-two sized buffer. +#[derive(Debug)] pub struct WrappedPotBuffer { - inner: Buffer, + inner: Arc, size: BufferAddress, // This field is assumed to be a power of 2. minimum: BufferAddress, @@ -30,12 +31,12 @@ where let usage = usage | BufferUsages::COPY_DST; Self { - inner: device.create_buffer(&BufferDescriptor { + inner: Arc::new(device.create_buffer(&BufferDescriptor { label: Some(label), size: minimum, usage, mapped_at_creation: false, - }), + })), size: minimum, minimum, usage, @@ -48,12 +49,12 @@ where let resize = resize_po2(self.size, desired, self.minimum); if let Some(size) = resize { self.size = size; - self.inner = device.create_buffer(&BufferDescriptor { + self.inner = Arc::new(device.create_buffer(&BufferDescriptor { label: Some(&self.label), size, usage: self.usage, mapped_at_creation: false, - }); + })); } } diff --git a/rend3/src/util/freelist/buffer.rs b/rend3/src/util/freelist/buffer.rs index efc695da..f9dd0b2e 100644 --- a/rend3/src/util/freelist/buffer.rs +++ b/rend3/src/util/freelist/buffer.rs @@ -5,11 +5,6 @@ use wgpu::{Buffer, BufferDescriptor, BufferUsages, CommandEncoder, Device}; use crate::util::scatter_copy::{ScatterCopy, ScatterData}; -const STARTING_SIZE: usize = 16; -const NEEDED_USAGES: BufferUsages = BufferUsages::STORAGE - .union(BufferUsages::COPY_DST) - .union(BufferUsages::COPY_SRC); - pub struct FreelistDerivedBuffer { inner: Buffer, @@ -21,6 +16,11 @@ pub struct FreelistDerivedBuffer { stale: Vec, } impl FreelistDerivedBuffer { + pub const STARTING_SIZE: usize = 16; + pub const NEEDED_USAGES: BufferUsages = BufferUsages::STORAGE + .union(BufferUsages::COPY_DST) + .union(BufferUsages::COPY_SRC); + pub fn new(device: &Device) -> Self where T: ShaderSize + WriteInto + 'static, @@ -29,16 +29,16 @@ impl FreelistDerivedBuffer { let buffer = device.create_buffer(&BufferDescriptor { label: Some("freelist buffer"), - size: rounded_size * STARTING_SIZE as u64, - usage: NEEDED_USAGES, + size: rounded_size * Self::STARTING_SIZE as u64, + usage: Self::NEEDED_USAGES, mapped_at_creation: false, }); Self { inner: buffer, - current_count: STARTING_SIZE, - reserved_count: STARTING_SIZE, + current_count: Self::STARTING_SIZE, + reserved_count: Self::STARTING_SIZE, rounded_size, stored_type: TypeId::of::(), @@ -70,7 +70,7 @@ impl FreelistDerivedBuffer { let new_buffer = device.create_buffer(&BufferDescriptor { label: Some("freelist buffer"), size: self.rounded_size * self.reserved_count as u64, - usage: NEEDED_USAGES, + usage: Self::NEEDED_USAGES, mapped_at_creation: false, }); diff --git a/rend3/src/util/math.rs b/rend3/src/util/math.rs index 484d7bcc..18638b86 100644 --- a/rend3/src/util/math.rs +++ b/rend3/src/util/math.rs @@ -2,13 +2,95 @@ use num_traits::PrimInt; -/// Rounds up `src` to the power of two `factor`. -pub fn round_up(src: T, factor: T) -> T { - let minus1 = factor - T::one(); - ((src + minus1) / factor) * factor +pub trait IntegerExt: PrimInt { + /// Rounds T away from zero to the nearest multiple of b. + /// + /// Panics if b is zero or negative. 
+ fn round_up(self, b: Self) -> Self { + round_up(self, b) + } + + /// Performs integer division between a and b rounding away from zero, instead of towards it. + /// + /// Panics if b is zero or negative. + fn div_round_up(self, b: Self) -> Self { + div_round_up(self, b) + } +} + +impl IntegerExt for T {} + +/// Rounds T away from zero to the nearest multiple of b. +/// +/// Panics if b is zero or negative. +pub fn round_up(a: T, b: T) -> T { + assert!(b > T::zero(), "divisor must be non-zero and positive"); + // All the negative infrastructure will compile away if T is unsigned as this is unconditionally false + let negative = a < T::zero(); + + let pos_a = if negative { T::zero() - a } else { a }; + + let rem = pos_a % b; + if rem == T::zero() { + return a; + } + + let pos_res = pos_a + (b - rem); + + if negative { + T::zero() - pos_res + } else { + pos_res + } +} + +/// Performs integer division between a and b rounding away from zero, instead of towards it. +/// +/// Panics if b is zero or negative. +pub fn div_round_up(a: T, b: T) -> T { + assert!(b > T::zero(), "divisor must be non-zero and positive"); + // All the negative infrastructure will compile away if T is unsigned as this is unconditionally false + let negative = a < T::zero(); + + let pos_a = if negative { T::zero() - a } else { a }; + + let pos_res = (pos_a + (b - T::one())) / b; + + if negative { + T::zero() - pos_res + } else { + pos_res + } } -/// Performs integer division between a and b rounding up, instead of down -pub fn round_up_div(a: T, b: T) -> T { - (a + (b - T::one())) / b +#[cfg(test)] +mod tests { + #[test] + fn round_up() { + assert_eq!(super::round_up(2, 12), 12); + assert_eq!(super::round_up(12, 12), 12); + assert_eq!(super::round_up(0, 12), 0); + + // Negatives + assert_eq!(super::round_up(-14, 12), -24); + assert_eq!(super::round_up(-8, 12), -12); + + // Identity + assert_eq!(super::round_up(2, 1), 2); + } + + #[test] + fn round_up_div() { + assert_eq!(super::div_round_up(2, 12), 1); + assert_eq!(super::div_round_up(12, 12), 1); + assert_eq!(super::div_round_up(18, 12), 2); + assert_eq!(super::div_round_up(0, 12), 0); + + // Negatives + assert_eq!(super::div_round_up(-14, 12), -2); + assert_eq!(super::div_round_up(-8, 12), -1); + + // Identity + assert_eq!(super::div_round_up(2, 1), 2); + } } diff --git a/rend3/src/util/scatter_copy.rs b/rend3/src/util/scatter_copy.rs index 863bb076..006aa7bc 100644 --- a/rend3/src/util/scatter_copy.rs +++ b/rend3/src/util/scatter_copy.rs @@ -8,7 +8,7 @@ use wgpu::{ use crate::util::{ bind_merge::{BindGroupBuilder, BindGroupLayoutBuilder}, - math::round_up_div, + math::div_round_up, }; pub struct ScatterData { @@ -134,7 +134,7 @@ impl ScatterCopy { }); cpass.set_pipeline(&self.pipeline); cpass.set_bind_group(0, &bg, &[]); - cpass.dispatch_workgroups(round_up_div(count_u32, 64), 1, 1); + cpass.dispatch_workgroups(div_round_up(count_u32, 64), 1, 1); drop(cpass); } }
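Finally, a worked example of the renamed `div_round_up`, as it is used by both the GPU skinner and the scatter-copy dispatches above (an element count divided by a workgroup size of 64). This is a simplified, unsigned-only restatement for illustration; the real helper in rend3::util::math is generic and also handles negative inputs.

// Divide, rounding the quotient up, so a partially filled workgroup still gets dispatched.
fn div_round_up(a: u32, b: u32) -> u32 {
    (a + (b - 1)) / b
}

fn main() {
    // 1000 vertices / 64 threads per workgroup = 15.625, so 16 workgroups cover every vertex.
    assert_eq!(div_round_up(1_000, 64), 16);
    assert_eq!(div_round_up(128, 64), 2); // exact multiples stay exact
    assert_eq!(div_round_up(0, 64), 0);   // nothing to do, nothing dispatched
}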