finished instancing rewrite

pull/81/head
Ben Hansen 4 years ago
parent b543c16492
commit a5c6b5ad48

@ -3,9 +3,13 @@ use winit::{
event_loop::{EventLoop, ControlFlow},
window::{Window, WindowBuilder},
};
use cgmath::prelude::*;
mod texture;
const NUM_INSTANCES_PER_ROW: u32 = 10;
const INSTANCE_DISPLACEMENT: cgmath::Vector3<f32> = cgmath::Vector3::new(NUM_INSTANCES_PER_ROW as f32 * 0.5, 0.0, NUM_INSTANCES_PER_ROW as f32 * 0.5);
#[repr(C)]
#[derive(Copy, Clone, Debug)]
struct Vertex {
@ -52,12 +56,6 @@ const INDICES: &[u16] = &[
2, 3, 4,
];
// NEW!
const NUM_INSTANCES_PER_ROW: u32 = 10;
#[allow(dead_code)]
const NUM_INSTANCES: u32 = NUM_INSTANCES_PER_ROW * NUM_INSTANCES_PER_ROW;
const INSTANCE_DISPLACEMENT: cgmath::Vector3<f32> = cgmath::Vector3::new(NUM_INSTANCES_PER_ROW as f32 * 0.5, 0.0, NUM_INSTANCES_PER_ROW as f32 * 0.5);
#[cfg_attr(rustfmt, rustfmt_skip)]
pub const OPENGL_TO_WGPU_MATRIX: cgmath::Matrix4<f32> = cgmath::Matrix4::new(
1.0, 0.0, 0.0, 0.0,
@ -85,7 +83,7 @@ impl Camera {
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
#[derive(Copy, Clone)]
struct Uniforms {
view_proj: cgmath::Matrix4<f32>,
}
@ -95,7 +93,6 @@ unsafe impl bytemuck::Zeroable for Uniforms {}
impl Uniforms {
fn new() -> Self {
use cgmath::SquareMatrix;
Self {
view_proj: cgmath::Matrix4::identity(),
}
@ -173,34 +170,22 @@ impl CameraController {
}
fn update_camera(&self, camera: &mut Camera) {
use cgmath::InnerSpace;
let forward = camera.target - camera.eye;
let forward_norm = forward.normalize();
let forward_mag = forward.magnitude();
// Prevents glitching when camera gets too close to the
// center of the scene.
if self.is_forward_pressed && forward_mag > self.speed {
camera.eye += forward_norm * self.speed;
let forward = (camera.target - camera.eye).normalize();
if self.is_forward_pressed {
camera.eye += forward * self.speed;
}
if self.is_backward_pressed {
camera.eye -= forward_norm * self.speed;
camera.eye -= forward * self.speed;
}
let right = forward_norm.cross(camera.up);
// Redo radius calc in case the up/ down is pressed.
let forward = camera.target - camera.eye;
let forward_mag = forward.magnitude();
let right = forward.cross(camera.up);
if self.is_right_pressed {
// Rescale the distance between the target and eye so
// that it doesn't change. The eye therefore still
// lies on the circle made by the target and eye.
camera.eye = camera.target - (forward + right * self.speed).normalize() * forward_mag;
camera.eye += right * self.speed;
}
if self.is_left_pressed {
camera.eye = camera.target - (forward - right * self.speed).normalize() * forward_mag;
camera.eye -= right * self.speed;
}
}
}
@ -211,6 +196,7 @@ struct Instance {
rotation: cgmath::Quaternion<f32>,
}
// NEW!
impl Instance {
fn to_raw(&self) -> InstanceRaw {
InstanceRaw {
@ -219,9 +205,10 @@ impl Instance {
}
}
// NEW!
#[repr(C)]
#[derive(Copy, Clone)]
struct InstanceRaw {
#[allow(dead_code)]
model: cgmath::Matrix4<f32>,
}
@ -241,6 +228,7 @@ struct State {
index_buffer: wgpu::Buffer,
num_indices: u32,
#[allow(dead_code)]
diffuse_texture: texture::Texture,
diffuse_bind_group: wgpu::BindGroup,
@ -252,7 +240,6 @@ struct State {
size: winit::dpi::PhysicalSize<u32>,
// NEW!
instances: Vec<Instance>,
#[allow(dead_code)]
instance_buffer: wgpu::Buffer,
@ -263,7 +250,6 @@ impl State {
let size = window.inner_size();
let surface = wgpu::Surface::create(window);
let adapter = wgpu::Adapter::request(
&wgpu::RequestAdapterOptions {
power_preference: wgpu::PowerPreference::Default,
@ -289,12 +275,7 @@ impl State {
let swap_chain = device.create_swap_chain(&surface, &sc_desc);
let diffuse_bytes = include_bytes!("happy-tree.png");
let (diffuse_texture, cmd_buffer) = texture::Texture::from_bytes(
&device,
diffuse_bytes,
"happy-tree.png"
).unwrap();
let (diffuse_texture, cmd_buffer) = texture::Texture::from_bytes(&device, diffuse_bytes, "happy-tree.png").unwrap();
queue.submit(&[cmd_buffer]);
let texture_bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
@ -335,7 +316,7 @@ impl State {
});
let camera = Camera {
eye: (0.0, 1.0, 2.0).into(),
eye: (0.0, 5.0, 10.0).into(),
target: (0.0, 0.0, 0.0).into(),
up: cgmath::Vector3::unit_y(),
aspect: sc_desc.width as f32 / sc_desc.height as f32,
@ -353,6 +334,34 @@ impl State {
wgpu::BufferUsage::UNIFORM | wgpu::BufferUsage::COPY_DST,
);
let instances = (0..NUM_INSTANCES_PER_ROW).flat_map(|z| {
(0..NUM_INSTANCES_PER_ROW).map(move |x| {
let position = cgmath::Vector3 { x: x as f32, y: 0.0, z: z as f32 } - INSTANCE_DISPLACEMENT;
let rotation = if position.is_zero() {
// this is needed so an object at (0, 0, 0) won't get scaled to zero
// as Quaternions can effect scale if they're not created correctly
cgmath::Quaternion::from_axis_angle(cgmath::Vector3::unit_z(), cgmath::Deg(0.0))
} else {
cgmath::Quaternion::from_axis_angle(position.clone().normalize(), cgmath::Deg(45.0))
};
Instance {
position, rotation,
}
})
}).collect::<Vec<_>>();
let instance_data = instances.iter().map(Instance::to_raw).collect::<Vec<_>>();
// we'll need the size for later
let instance_buffer_size = instance_data.len() * std::mem::size_of::<cgmath::Matrix4<f32>>();
let instance_buffer = device.create_buffer_with_data(
bytemuck::cast_slice(&instance_data),
wgpu::BufferUsage::STORAGE_READ,
);
let uniform_bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
bindings: &[
wgpu::BindGroupLayoutEntry {
@ -361,7 +370,18 @@ impl State {
ty: wgpu::BindingType::UniformBuffer {
dynamic: false,
},
}
},
// NEW!
wgpu::BindGroupLayoutEntry {
binding: 1,
visibility: wgpu::ShaderStage::VERTEX,
ty: wgpu::BindingType::StorageBuffer {
// We don't plan on changing the size of this buffer
dynamic: false,
// The shader is not allowed to modify it's contents
readonly: true,
},
},
],
label: Some("uniform_bind_group_layout"),
});
@ -375,7 +395,15 @@ impl State {
buffer: &uniform_buffer,
range: 0..std::mem::size_of_val(&uniforms) as wgpu::BufferAddress,
}
}
},
// NEW!
wgpu::Binding {
binding: 1,
resource: wgpu::BindingResource::Buffer {
buffer: &instance_buffer,
range: 0..instance_buffer_size as wgpu::BufferAddress,
}
},
],
label: Some("uniform_bind_group"),
});
@ -460,6 +488,9 @@ impl State {
uniform_bind_group,
uniforms,
size,
// NEW!
instances,
instance_buffer,
}
}
@ -527,7 +558,8 @@ impl State {
render_pass.set_bind_group(1, &self.uniform_bind_group, &[]);
render_pass.set_vertex_buffer(0, &self.vertex_buffer, 0, 0);
render_pass.set_index_buffer(&self.index_buffer, 0, 0);
render_pass.draw_indexed(0..self.num_indices, 0, 0..1);
// UPDATED!
render_pass.draw_indexed(0..self.num_indices, 0, 0..1000 as _);
}
self.queue.submit(&[

@ -3,8 +3,6 @@
layout(location=0) in vec3 a_position;
layout(location=1) in vec2 a_tex_coords;
layout(location=2) in mat4 a_model;
layout(location=0) out vec2 v_tex_coords;
layout(set=1, binding=0)
@ -12,7 +10,14 @@ uniform Uniforms {
mat4 u_view_proj;
};
// NEW!
layout(set=1, binding=1)
buffer Instances {
mat4 s_models[];
};
void main() {
v_tex_coords = a_tex_coords;
gl_Position = u_view_proj * a_model * vec4(a_position, 1.0);
// UPDATED!
gl_Position = u_view_proj * s_models[gl_InstanceIndex] * vec4(a_position, 1.0);
}

@ -62,8 +62,6 @@ pub const OPENGL_TO_WGPU_MATRIX: cgmath::Matrix4<f32> = cgmath::Matrix4::new(
);
const NUM_INSTANCES_PER_ROW: u32 = 10;
#[allow(dead_code)]
const NUM_INSTANCES: u32 = NUM_INSTANCES_PER_ROW * NUM_INSTANCES_PER_ROW;
const INSTANCE_DISPLACEMENT: cgmath::Vector3<f32> = cgmath::Vector3::new(NUM_INSTANCES_PER_ROW as f32 * 0.5, 0.0, NUM_INSTANCES_PER_ROW as f32 * 0.5);

@ -0,0 +1,28 @@
excerpt(radius) {
border-radius radius
-webkit-border-radius radius
-moz-border-radius radius
padding 0.5rem 1.5rem
}
.note {
background-color $noteColor
excerpt(6px)
}
.warning {
background-color $warningColor
color $warningTextColor
excerpt(6px)
code {
color: $warningInlineCodeColor
}
}
@media screen and (max-width $MQMobile) {
.note {
excerpt(0px)
margin 0.0rem -1.5rem
}
}

@ -0,0 +1,27 @@
// colors
$accentColor = darken(#abcdef, 20%)
$textColor = #2c3e50
$borderColor = #eaecef
$codeBgColor = #282c34
$arrowBgColor = #ccc
$badgeTipColor = #42b983
$badgeWarningColor = darken(#ffe564, 35%)
$badgeErrorColor = #DA5961
// layout
$navbarHeight = 3.6rem
$sidebarWidth = 20rem
$contentWidth = 740px
$homePageWidth = 960px
// responsive breakpoints
$MQNarrow = 959px
$MQMobile = 719px
$MQMobileNarrow = 419px
// custom
$noteColor = lighten($borderColor, 10%)
$warningColor = #ff6633
$warningTextColor = #333333
$warningInlineCodeColor = #eeeeee
$horizontalMargin = 1.5rem

@ -1,283 +1,125 @@
# Instancing
Up to this point we've been drawing just one object. Most games have hundreds of objects on screen at the same time. If we wanted to draw multiple instances of our model, we could copy the vertex buffer and modify it's vertices to be in the right place, but this would be hilariously inefficient. We have our model, and we now how to position it in 3d space with a matrix, like we did the camera, so all we have to do is change the matrix we're using when we draw.
Our scene right now is very simple: we have one object centered at (0,0,0). What if we wanted more objects? This is were instancing comes in.
## The naive method
Instancing allows use to draw the same object multiple times with different properties (position, orientation, size, color, etc.). There are multiple ways of doing instancing. One way would be to modify the uniform buffer to include these properties and then update it before we draw each instance of our object.
First let's modify `Uniforms` to include a `model` property.
We don't want to use this method for performance reasons. Updating the uniform buffer for each instance would require multiple buffer copies each frame. On top of that, our method to update the uniform buffer currently requires use to create a new buffer to store the updated data. That's a lot of time wasted between draw calls.
```rust
#[repr(C)]
#[derive(Debug, Copy, Clone)]
struct Uniforms {
view_proj: cgmath::Matrix4<f32>,
model: cgmath::Matrix4<f32>, // NEW!
}
unsafe impl bytemuck::Pod for Uniforms {}
unsafe impl bytemuck::Zeroable for Uniforms {}
impl Uniforms {
fn new() -> Self {
use cgmath::SquareMatrix;
Self {
view_proj: cgmath::Matrix4::identity(),
model: cgmath::Matrix4::identity(), // NEW!
}
}
fn update_view_proj(&mut self, camera: &Camera) {
self.view_proj = OPENGL_TO_WGPU_MATRIX * camera.build_view_projection_matrix();
}
}
```
With that let's introduce another struct for our instances. We'll use it to store the position and rotation of our instances. We'll also have a method to convert our instance data into a matrix that we can give to `Uniforms`.
If we look at the parameters for the `draw_indexed` function [in the wgpu docs](https://docs.rs/wgpu/0.5.2/wgpu/struct.RenderPass.html#method.draw_indexed), we can see a solution to our problem.
```rust
struct Instance {
position: cgmath::Vector3<f32>,
rotation: cgmath::Quaternion<f32>,
}
impl Instance {
fn to_matrix(&self) -> cgmath::Matrix4<f32> {
cgmath::Matrix4::from_translation(self.position)
* cgmath::Matrix4::from(self.rotation)
}
}
```
Next we'll add `instances: Vec<Instance>,` to `State` and create our instances with the following in `new()`.
```rust
// ...
// add these at the top of the file
const NUM_INSTANCES_PER_ROW: u32 = 10;
const NUM_INSTANCES: u32 = NUM_INSTANCES_PER_ROW * NUM_INSTANCES_PER_ROW;
const INSTANCE_DISPLACEMENT: cgmath::Vector3<f32> = cgmath::Vector3::new(NUM_INSTANCES_PER_ROW as f32 * 0.5, 0.0, NUM_INSTANCES_PER_ROW as f32 * 0.5);
// make a 10 by 10 grid of objects
let instances = (0..NUM_INSTANCES_PER_ROW).flat_map(|z| {
(0..NUM_INSTANCES_PER_ROW).map(move |x| {
let position = cgmath::Vector3 { x: x as f32, y: 0.0, z: z as f32 } - INSTANCE_DISPLACEMENT;
let rotation = if position.is_zero() {
// this is needed so an object at (0, 0, 0) won't get scaled to zero
// as Quaternions can effect scale if they're not create correctly
cgmath::Quaternion::from_axis_angle(cgmath::Vector3::unit_z(), cgmath::Deg(0.0))
} else {
cgmath::Quaternion::from_axis_angle(position.clone().normalize(), cgmath::Deg(45.0))
};
Instance {
position, rotation,
}
})
}).collect();
// ...
Self {
// ...
instances,
}
pub fn draw_indexed(
&mut self,
indices: Range<u32>,
base_vertex: i32,
instances: Range<u32> // <-- This right here
)
```
Now that that's done, we need to update `shader.vert` to use the model matrix passed in through `Uniforms`.
```glsl
#version 450
layout(location=0) in vec3 a_position;
layout(location=1) in vec2 a_tex_coords;
The `instances` parameter takes a `Range<u32>`. This parameter tells the GPU how many copies, or instances, of our model we want to draw. Currently we are specifying `0..1`, which the GPU will draw our model once, and then it will stop. If we used `0..5`, our code would draw 5 instances.
layout(location=0) out vec2 v_tex_coords;
The fact that `instances` is a `Range<u32>` may seem weird as using `1..2` for instances would still draw 1 instance of our object. Seems like it would be simpler to just use a `u32` right? The reason it's a range is because sometimes we don't want to draw **all** of our objects. Sometimes we want to draw a selection of them, because others are not in frame, our we are debugging and want to look at a particular set of instances.
layout(set=1, binding=0)
uniform Uniforms {
mat4 u_view_proj;
mat4 u_model; // NEW!
};
Ok, now we know how to draw multiple instances of an object, how do we tell wgpu what particular instance to draw? We are going to use something known as an instance buffer.
void main() {
v_tex_coords = a_tex_coords;
gl_Position = u_view_proj * u_model * vec4(a_position, 1.0); // UPDATED!
}
```
## The Instance Buffer
If you run the program now, you won't see anything different. That's because we aren't actually updating the uniform buffer at all. Using our current method, we need to update the uniform buffer for every instance we draw. We'll do this in `render()` with something like the following.
We'll create an instance buffer in a similar way to how we create a uniform buffer. First we'll create a struct called `Instance`.
```rust
for instance in &self.instances {
// 1.
self.uniforms.model = instance.to_matrix();
let staging_buffer = self.device.create_buffer_with_data(
bytemuck::cast_slice(&[self.uniforms]),
wgpu::BufferUsage::COPY_SRC,
);
encoder.copy_buffer_to_buffer(&staging_buffer, 0, &self.uniform_buffer, 0, std::mem::size_of::<Uniforms>() as wgpu::BufferAddress);
// 2.
let mut render_pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
color_attachments: &[
wgpu::RenderPassColorAttachmentDescriptor {
attachment: &frame.view,
resolve_target: None,
load_op: wgpu::LoadOp::Load, // 3.
store_op: wgpu::StoreOp::Store,
clear_color: wgpu::Color {
r: 0.1,
g: 0.2,
b: 0.3,
a: 1.0,
},
}
],
depth_stencil_attachment: None,
});
render_pass.set_pipeline(&self.render_pipeline);
render_pass.set_bind_group(0, &self.diffuse_bind_group, &[]);
render_pass.set_bind_group(1, &self.uniform_bind_group, &[]);
render_pass.set_vertex_buffer(0, &self.vertex_buffer, 0, 0);
render_pass.set_index_buffer(&self.index_buffer, 0, 0);
render_pass.draw_indexed(0..self.num_indices, 0, 0..1);
}
```
Some things to note:
1. We're creating a hundred buffers a frame. This is inefficent, but we'll cover better ways of doing this later in this tutorial.
2. We have to create a new render pass per instance, as we can't modify the uniform buffer while we have one active.
3. We use `LoadOp::Load` here to prevent the render pass from clearing the entire screen after each draw. This means we lose our clear color. This makes the background black on my machine, but it may be filled with garbage data on yours. We can fix this by added another render pass before the loop.
// main.rs
// ...
```rust
{
encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
color_attachments: &[
wgpu::RenderPassColorAttachmentDescriptor {
attachment: &frame.view,
resolve_target: None,
load_op: wgpu::LoadOp::Clear,
store_op: wgpu::StoreOp::Store,
clear_color: wgpu::Color {
r: 0.1,
g: 0.2,
b: 0.3,
a: 1.0,
},
}
],
depth_stencil_attachment: None,
});
// NEW!
struct Instance {
position: cgmath::Vector3<f32>,
rotation: cgmath::Quaternion<f32>,
}
```
We should get something that looks like this when we're done.
<div class="note">
![A beautiful forest](./forest.png)
A `Quaternion` is a mathematical structure often used to represent rotation. The math behind them is beyond me (it involves imaginary numbers and 4D space) so I won't be covering them here. If you really want to dive into them [here's a Wolfram Alpha article](https://mathworld.wolfram.com/Quaternion.html).
If you haven't guessed already, this way of instancing is not the best. It requires hundreds of render passes, hundereds of staging buffers, and an extra render pass just to get the clear color working again. Cleary there must be a better way.
</div>
## A better way - uniform arrays
Since GLSL is based on C, it supports arrays. We can leverage this by store *all* of the instance matrices in the `Uniforms` struct. We need to make this change on the Rust side, as well as in our shader.
Using these values directly in the shader would be a pain as quaternions don't have a GLSL analog. I don't feel like writing the math in the shader, so we'll convert the `Instance` data into a matrix and store it into a struct called `InstanceRaw`.
```rust
// NEW!
#[repr(C)]
#[derive(Copy, Clone)]
struct Uniforms {
view_proj: cgmath::Matrix4<f32>,
model: [cgmath::Matrix4<f32>; NUM_INSTANCES as usize],
struct InstanceRaw {
model: cgmath::Matrix4<f32>,
}
```
```glsl
#version 450
layout(location=0) in vec3 a_position;
layout(location=1) in vec2 a_tex_coords;
layout(location=0) out vec2 v_tex_coords;
layout(set=1, binding=0)
uniform Uniforms {
mat4 u_view_proj;
mat4 u_model[100];
};
void main() {
v_tex_coords = a_tex_coords;
// gl_InstanceIndex what index we're currently on
gl_Position = u_view_proj * u_model[gl_InstanceIndex] * vec4(a_position, 1.0);
}
unsafe impl bytemuck::Pod for InstanceRaw {}
unsafe impl bytemuck::Zeroable for InstanceRaw {}
```
Note that we're using an array, *not a `Vec`*. `Vec`s are basically pointers to an array on the heap. Our graphics card doesn't know how follow a pointer to the heap, so our data needs to be stored inline.
This is the data to will go into the `wgpu::Buffer`. We keep these separate so that we can update the `Instance` as much as we want without needing to mess with matrices. We only need to update the raw data before we draw.
`Uniforms::new()` will change slightly as well.
Let's create a method on `Instance` to convert to `InstanceRaw`.
```rust
fn new() -> Self {
Self {
view_proj: cgmath::Matrix4::identity(),
model: [cgmath::Matrix4::identity(); NUM_INSTANCES as usize],
// NEW!
impl Instance {
fn to_raw(&self) -> InstanceRaw {
InstanceRaw {
model: cgmath::Matrix4::from_translation(self.position) * cgmath::Matrix4::from(self.rotation),
}
}
}
```
We need to update our model matrices in `State::update()` before we create the `staging_buffer`.
Now we need to add 2 fields to `State`: `instances`, and `instance_buffer`.
```rust
for (i, instance) in self.instances.iter().enumerate() {
self.uniforms.model[i] = instance.to_matrix();
struct State {
instances: Vec<Instance>,
#[allow(dead_code)]
instance_buffer: wgpu::Buffer,
}
```
Lastly we need to change our render code. Fortunately, it's a lot simpler than the before. In fact we can use the code from last tutorial and just change our draw call.
We'll create the instances in `new()`. We'll use some constants to simplify things. We'll display our instances in 10 rows of 10, and they'll be spaced evenly apart.
```rust
render_pass.draw_indexed(0..self.num_indices, 0, 0..NUM_INSTANCES);
const NUM_INSTANCES_PER_ROW: u32 = 10;
const NUM_INSTANCES: u32 = NUM_INSTANCES_PER_ROW * NUM_INSTANCES_PER_ROW;
const INSTANCE_DISPLACEMENT: cgmath::Vector3<f32> = cgmath::Vector3::new(NUM_INSTANCES_PER_ROW as f32 * 0.5, 0.0, NUM_INSTANCES_PER_ROW as f32 * 0.5);
```
You'll remember that the 3rd parameter in `draw_indexed` is the instance range. This controls how many times our object will be drawn. This is where our shader gets the value for `gl_InstanceIndex`.
Running the program now won't change anything visually from our last example, but the framerate will be better.
This technique has its drawbacks.
1. We can't use a `Vec` like we've mentioned before
2. We're limited in the number of instances we can process at a time requiring use to cap it at some abitrary number, or render things in "batches". If we want to increase the size of instances, we have to recompile our code.
## Another better way - storage buffers
A storage buffer gives us the flexibility that arrays did not. We don't have to specify it's size in the shader, and we can even use a `Vec` to create it!
Since we're using `bytemuck` for casting our data to `&[u8]`, we're going to need to define a custom scruct to store the `cgmath::Matrix4`s. We need to do this because we can't implement `bytemuck::Pod`, and `bytemuck::Zeroable`, on `cgmath::Matrix4` because we don't that type.
Now we can create the actual instances.
```rust
// UPDATED!
impl Instance {
// This is changed from `to_matrix()`
fn to_raw(&self) -> InstanceRaw {
InstanceRaw {
model: cgmath::Matrix4::from_translation(self.position) * cgmath::Matrix4::from(self.rotation),
}
impl State {
async fn new(window: &Window) -> Self {
// ...
let instances = (0..NUM_INSTANCES_PER_ROW).flat_map(|z| {
(0..NUM_INSTANCES_PER_ROW).map(move |x| {
let position = cgmath::Vector3 { x: x as f32, y: 0.0, z: z as f32 } - INSTANCE_DISPLACEMENT;
let rotation = if position.is_zero() {
// this is needed so an object at (0, 0, 0) won't get scaled to zero
// as Quaternions can effect scale if they're not created correctly
cgmath::Quaternion::from_axis_angle(cgmath::Vector3::unit_z(), cgmath::Deg(0.0))
} else {
cgmath::Quaternion::from_axis_angle(position.clone().normalize(), cgmath::Deg(45.0))
};
Instance {
position, rotation,
}
})
}).collect::<Vec<_>>();
// ...
}
}
// NEW!
#[repr(C)]
#[derive(Copy, Clone)]
struct InstanceRaw {
model: cgmath::Matrix4<f32>,
}
unsafe impl bytemuck::Pod for InstanceRaw {}
unsafe impl bytemuck::Zeroable for InstanceRaw {}
```
We create a storage buffer in a similar way as any other buffer.
Now that we have our data, we can create the actual `instance_buffer`.
```rust
let instance_data = instances.iter().map(Instance::to_raw).collect::<Vec<_>>();
@ -289,28 +131,32 @@ let instance_buffer = device.create_buffer_with_data(
);
```
To get this buffer into the shader, we'll need to attach it to a bind group. We'll use `uniform_bind_group` just to keep things simple.
We need a way to bind our new instance buffer so we can use it in the vertex shader. We could create a new bind group (and we probably should), but for simplicity, I'm going to add a binding to the `uniform_bind_group` that references our `instance_buffer`.
```rust
let uniform_bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
bindings: &[
// ...
// NEW!
wgpu::BindGroupLayoutEntry {
binding: 1,
visibility: wgpu::ShaderStage::VERTEX,
ty: wgpu::BindingType::StorageBuffer {
// We don't plan on changing the size of this buffer
dynamic: false,
// The shader is not allowed to modify it's contents
readonly: true,
},
},
],
// ...
label: Some("uniform_bind_group_layout"),
});
let uniform_bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
layout: &uniform_bind_group_layout,
bindings: &[
// ...
// NEW!
wgpu::Binding {
binding: 1,
resource: wgpu::BindingResource::Buffer {
@ -319,414 +165,73 @@ let uniform_bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
}
},
],
label: Some("uniform_bind_group"),
});
```
*Note you'll probably need to shift your `instance_buffer` creation above the `uniform_bind_group` creation.*
We'll want to put `instance_buffer` into the `State` struct.
You don't need to change the draw call at all from the previous example, but we'll need to change the vertex shader.
```glsl
#version 450
layout(location=0) in vec3 a_position;
layout(location=1) in vec2 a_tex_coords;
layout(location=0) out vec2 v_tex_coords;
layout(set=1, binding=0)
uniform Uniforms {
mat4 u_view_proj;
};
layout(set=1, binding=1)
buffer Instances {
mat4 s_models[];
};
void main() {
v_tex_coords = a_tex_coords;
gl_Position = u_view_proj * s_models[gl_InstanceIndex] * vec4(a_position, 1.0);
}
```
You can see that we got rid of the `u_model` field from the `Uniforms` block and create a new `Instances` located at `set=1, binding=1` corresponding with our bind group layout. Another thing to notice is that we use the `buffer` keyword for the block instead of `uniform`. The details of the `buffer` can be found on [the OpenGL wiki](https://www.khronos.org/opengl/wiki/Shader_Storage_Buffer_Object).
This method is nice because it allows us to store more data overall as storage buffers can theoretically store as much data as the GPU can handle, where uniform buffers are capped. This does mean that storage buffers are slower that uniform buffers as they are stored like other buffers such as textures as and therefore aren't as close in memory, but that usually won't matter much if you're dealing with large amounts of data.
Another benefit to storage buffers is that they can be written to by the shader, unlike uniform buffers. If we want to mutate a large amount of data with a compute shader, we'd use a writeable storage buffer for our output (and potentially input as well).
## Another better way - vertex buffers
When we created the `VertexBufferDescriptor` for our model, it required a `step_mode` field. We used `InputStepMode::Vertex`, this time we'll create a `VertexBufferDescriptor` for our `instance_buffer`.
We'll take the code from the previous example and then create a trait called `VBDesc`, and implement it for `Vertex` (replacing the old `impl`), and a newly created `InstanceRaw` class. *Note: we could just `impl VBDesc for cgmath::Matrix4<f32>` instead, but instances could have more data in the future, so it's better to create a new struct.*
Here's our new trait.
```rust
trait VBDesc {
fn desc<'a>() -> wgpu::VertexBufferDescriptor<'a>;
}
```
To change `Vertex` to use this, we just have to swap `impl Vertex`, for `impl VBDesc for Vertex`.
With that done we can implement `VBDesc` for `InstanceRaw`.
Don't forget to return our new variables!
```rust
const FLOAT_SIZE: wgpu::BufferAddress = std::mem::size_of::<f32>() as wgpu::BufferAddress;
impl VBDesc for InstanceRaw {
fn desc<'a>() -> wgpu::VertexBufferDescriptor<'a> {
wgpu::VertexBufferDescriptor {
stride: std::mem::size_of::<InstanceRaw>() as wgpu::BufferAddress,
step_mode: wgpu::InputStepMode::Instance, // 1.
attributes: &[
wgpu::VertexAttributeDescriptor {
offset: 0,
format: wgpu::VertexFormat::Float4, // 2.
shader_location: 2, // 3.
},
wgpu::VertexAttributeDescriptor {
offset: FLOAT_SIZE * 4,
format: wgpu::VertexFormat::Float4,
shader_location: 3,
},
wgpu::VertexAttributeDescriptor {
offset: FLOAT_SIZE * 4 * 2,
format: wgpu::VertexFormat::Float4,
shader_location: 4,
},
wgpu::VertexAttributeDescriptor {
offset: FLOAT_SIZE * 4 * 3,
format: wgpu::VertexFormat::Float4,
shader_location: 5,
},
]
}
}
}
```
Let's unpack this a bit.
1. This line makes what would be a vertex buffer into and index buffer. If we didn't specify this, the shader would loop through the elements in this list for every vertex.
2. Vertex attributes have a limited size: `Float4` or the equivalent. This means that our instance buffer will take up multiple attribute slots. 4 in our case.
3. Since we're using 2 slots for our `Vertex` struct, we need to start the `shader_location` at 2.
Now we need to add a `VertexBufferDescriptor` to our `render_pipeline`.
```rust
let render_pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
// ...
vertex_state: wgpu::VertexStateDescriptor {
index_format: wgpu::IndexFormat::Uint16,
vertex_buffers: &[
Vertex::desc(),
InstanceRaw::desc(), // NEW!
],
},
Self {
// ...
});
```
*You'll probably want to remove the `BindGroupLayoutBinding` and `Binding` from `uniform_bind_group_layout` and `uniform_bind_group` respectively, as we won't be accessing our buffer from there.*
We'll also want to change `instance_buffer`'s `BufferUsage` to `VERTEX`.
```rust
let instance_data = instances.iter().map(Instance::to_raw).collect::<Vec<_>>();
let instance_buffer = device.create_buffer_with_data(
bytemuck::cast_slice(&instance_data),
wgpu::BufferUsage::VERTEX,
);
```
This last thing we'll need to do from Rust is use our `instance_buffer` in the `render()` method.
```rust
render_pass.set_vertex_buffer(0, &self.vertex_buffer, 0, 0);
render_pass.set_vertex_buffer(1, &self.instance_buffer, 0, 0); // NEW!
```
Now we get to the shader. We don't have to change much, we just make our shader reference our `instance_buffer` through the attributes rather than a uniform/buffer block.
```glsl
#version 450
layout(location=0) in vec3 a_position;
layout(location=1) in vec2 a_tex_coords;
layout(location=2) in mat4 a_model; // NEW!
layout(location=0) out vec2 v_tex_coords;
layout(set=1, binding=0)
uniform Uniforms {
mat4 u_view_proj;
};
void main() {
v_tex_coords = a_tex_coords;
gl_Position = u_view_proj * a_model * vec4(a_position, 1.0); // UPDATED!
// NEW!
instances,
instance_buffer,
}
```
That's all you need to get an instance buffer working! There's a bit of overhead to get things working, and there are a few quirks, but it gets the job.
## A different way - textures
This seems like a really backwards way to do instancing. Storing non image data in a texture seems really bizarre even though it's a perfectly valid thing to do. After all, a texture is just an array of bytes, and that could theoretically be anything. In our case, we're going to cram our matrix data into that array of bytes.
If you're following along, it'd be best to start from the storage buffer example. We're going to modify it to take our `instance_buffer`, and copy it into a 1D `instance_texture`. First we need to create the texture.
* We won't use our `texture` module for this, though we could refactor it to store random data as a texture.
```rust
let instance_extent = wgpu::Extent3d {
width: instance_data.len() as u32 * 4,
height: 1,
depth: 1,
};
let instance_texture = device.create_texture(&wgpu::TextureDescriptor {
size: instance_extent,
array_layer_count: 1,
mip_level_count: 1,
sample_count: 1,
dimension: wgpu::TextureDimension::D1,
format: wgpu::TextureFormat::Rgba32Float,
usage: wgpu::TextureUsage::SAMPLED | wgpu::TextureUsage::COPY_DST,
label: Some("instance_texture"),
});
```
All of this is fairly normal texture creation stuff, save two things:
1. We specify the height of the texture as 1. While you could theoretically use a height greater than 1, keeping the texture 1D simplifies things a bit. This also means that we need to use `TextureDimension::D1` for our `dimension`.
2. We're using `TextureFormat::Rgba32Float` for the texture format. Since our matrices are 32bit floats, this makes sense. We could use lower memory formats such as `Rgba16Float`, or even `Rgba8UnormSrgb`, but we loose precision when we do that. We might not need that precision for basic rendering, but applications that need to model reality definetly do.
With that said, let's create our texture view and sampler for our `instance_texture`.
The last change we need to make is in the `render()` method. We need to change the range we're using in `draw_indexed()` to include use the number of instances.
```rust
let instance_texture_view = instance_texture.create_default_view();
let instance_sampler = device.create_sampler(&wgpu::SamplerDescriptor {
address_mode_u: wgpu::AddressMode::ClampToEdge,
address_mode_v: wgpu::AddressMode::ClampToEdge,
address_mode_w: wgpu::AddressMode::ClampToEdge,
mag_filter: wgpu::FilterMode::Nearest,
min_filter: wgpu::FilterMode::Nearest,
mipmap_filter: wgpu::FilterMode::Nearest,
lod_min_clamp: -100.0,
lod_max_clamp: 100.0,
compare: wgpu::CompareFunction::Always,
});
render_pass.set_pipeline(&self.render_pipeline);
render_pass.set_bind_group(0, &self.diffuse_bind_group, &[]);
render_pass.set_bind_group(1, &self.uniform_bind_group, &[]);
render_pass.set_vertex_buffer(0, &self.vertex_buffer, 0, 0);
render_pass.set_index_buffer(&self.index_buffer, 0, 0);
// UPDATED!
render_pass.draw_indexed(0..self.num_indices, 0, 0..self.instances.len() as _);
```
Then we need to copy the `instance_buffer` to our `instance_texture`.
<div class="warning">
```rust
let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: Some("instance_texture_encoder"),
});
encoder.copy_buffer_to_texture(
wgpu::BufferCopyView {
buffer: &instance_buffer,
offset: 0,
bytes_per_row: std::mem::size_of::<f32>() as u32 * 4,
rows_per_image: instance_data.len() as u32 * 4,
},
wgpu::TextureCopyView {
texture: &instance_texture,
mip_level: 0,
array_layer: 0,
origin: wgpu::Origin3d::ZERO,
},
instance_extent,
);
queue.submit(&[encoder.finish()]);
```
Make sure if you add new instances to the `Vec` that you recreate the `instance_buffer` and as well as `uniform_bind_group`, otherwise you're new instances won't show up correctly.
Now we need to add our texture and sampler to a bind group. Let with the storage buffer example, we'll use `uniform_bind_group` and its corresponding layout.
</div>
```rust
let uniform_bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
bindings: &[
// ...
wgpu::BindGroupLayoutEntry {
binding: 1,
visibility: wgpu::ShaderStage::VERTEX,
ty: wgpu::BindingType::SampledTexture {
multisampled: false,
component_type: wgpu::TextureComponentType::Uint,
dimension: wgpu::TextureViewDimension::D1,
}
},
wgpu::BindGroupLayoutEntry {
binding: 2,
visibility: wgpu::ShaderStage::VERTEX,
ty: wgpu::BindingType::Sampler { comparison: false },
},
],
// ...
});
let uniform_bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
layout: &uniform_bind_group_layout,
bindings: &[
// ...
wgpu::Binding {
binding: 1,
resource: wgpu::BindingResource::TextureView(&instance_texture_view),
},
wgpu::Binding {
binding: 2,
resource: wgpu::BindingResource::Sampler(&instance_sampler),
},
],
// ...
});
```
## Storage Buffers
With all that done we can now move onto the vertex shader. Let's start with the new uniforms. *Don't forget to delete the old `buffer` block.*
When we modified `uniform_bind_group_layout`, we specified that our `instance_buffer` would be of type `wgpu::BindingType::StorageBuffer`. A storage buffer functions like an array that persists between shader invocations. Let's take a look at what it looks like in `shader.vert`.
```glsl
// we use a texture1D instead of texture2d because our texture is 1D
layout(set = 1, binding = 1) uniform texture1D t_model;
layout(set = 1, binding = 2) uniform sampler s_model;
layout(set=1, binding=1)
buffer Instances {
mat4 s_models[];
};
```
The next part is a little more intensive, as there's now built in way to process our texture data as matrix data. We'll have to write a function to do that.
```glsl
mat4 get_matrix(int index) {
return mat4(
texelFetch(sampler1D(t_model, s_model), index * 4, 0),
texelFetch(sampler1D(t_model, s_model), index * 4 + 1, 0),
texelFetch(sampler1D(t_model, s_model), index * 4 + 2, 0),
texelFetch(sampler1D(t_model, s_model), index * 4 + 3, 0)
);
}
```
We declare a storage buffer in a very similar way to how we declare a uniform block. The only real difference is that we use the `buffer` keyword. We can then use `s_models` to position our models in the scene. But how do we know what instance to use?
This function takes in the index of the instance of the model we are rendering, and pulls our 4 pixels from the image corresponding to to 4 sets of floats that make up that instance's matrix. It then packs them into a `mat4` and returns that.
## gl_InstanceIndex
Now we need to change our `main()` function to use `get_matrix()`.
This GLSL variable let's use specify what instance we want to use. We can use the `gl_InstanceIndex` to index our `s_models` buffer to get the matrix for the current model.
```glsl
void main() {
v_tex_coords = a_tex_coords;
mat4 transform = get_matrix(gl_InstanceIndex);
gl_Position = u_view_proj * transform * vec4(a_position, 1.0);
// UPDATED!
gl_Position = u_view_proj * s_models[gl_InstanceIndex] * vec4(a_position, 1.0);
}
```
There's a couple of things we need to do before this method will work. First `instance_buffer` we'll need to be `BufferUsage::COPY_SRC`.
<div class="note">
```rust
let instance_buffer = device.create_buffer_with_data(
bytemuck::cast_slice(&instance_data),
wgpu::BufferUsage::COPY_SRC, // UPDATED!
);
```
The value of `gl_InstanceIndex` is based on the range passed to the `instances` parameter of `draw_indexed`. Using `3..instances.len() as _` would mean that the 1st-3rd instances would be skipped.
You'll need is to remove `InstanceRaw::desc()` from the `render_pipeline`'s `vertex_state`.
</div>
```rust
let render_pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
// ...
vertex_state: wgpu::VertexStateDescriptor {
index_format: wgpu::IndexFormat::Uint16,
vertex_buffers: &[
Vertex::desc(),
],
},
// ...
});
```
Lastly you'll want to store `instance_texture` in `State` to prevent it from being disposed of.
```rust
// new()
Self {
// ...
instance_texture,
}
```
With all that done, we should have a forest of trees!
That's a lot more work than the other method's, but it's still good to know that you can use textures store things other then color. This technique does come in handy when other solutions are not available, or not as performant. It's good to be aware of the possibilities!
For fun, here's what our matrix data looks like when converted into a texture (scaled up 10 times)!
![lots of colorful squares](./instance_texture_scaled.png)
## Recap
<table style="width:100%">
<tr>
<th>Technique</th>
<th>Pros</th>
<th>Cons</th>
</tr>
<tr>
<td>Naive Approach</td>
<td><ul><li>Super simple</li></ul></td>
<td><ul><li>Super slow with lots of instances</li></ul></td>
</tr>
<tr>
<td>Uniform Buffer</td>
<td><ul>
<li>Quicker then other techniques</li>
</ul></td>
<td><ul>
<li>Requires using fixed size array</li>
<li>Limited size</li>
<li>Requires a bind group</li>
</ul></td>
</tr>
<tr>
<td>Storage Buffer</td>
<td><ul>
<li>Larger size</li>
<li>Allows modifying data</li>
<li>We can use <code>Vec</code></li>
</ul></td>
<td><ul>
<li>Slower than uniform buffers</li>
<li>Requires a bind group</li>
</ul></td>
</tr>
<tr>
<td>Instance Buffer</td>
<td><ul>
<li>Larger size</li>
<li>Doesn't need <code>gl_InstanceIndex</code></li>
</ul></td>
<td><ul>
<li>Requires <code>VertexBufferDescriptor</code></li>
<li>Requires passing in the vertex buffer to the render pass</li>
<li>Vertex attributes are limited in size (4 floats)</li>
</ul></td>
</tr>
<tr>
<td>Textures</td>
<td><ul>
<li>Universally supported</li>
<li>Faster than naive approach</li>
</ul></td>
<td><ul>
<li>Requires decoding data manually</li>
<li>Limited to by pixel format</li>
<li>Requires a copying data to the texture via buffer</li>
<li>Requires a bind group</li>
</ul></td>
</tr>
</table>
## About the depth issues...
You may have noticed that some of the back pentagons are rendering in front of the ones in the front. This is a draw order issue. We could solve this by sorting the instances from back to front, that would only work from certain camera angles. A more flexible approach would be to use a *depth buffer*. We'll talk about those [next time](/todo).
![./forest.png](./forest.png)
## Challenge

Loading…
Cancel
Save