mirror of
https://github.com/rust-embedded/rust-raspberrypi-OS-tutorials.git
synced 2024-11-03 15:40:21 +00:00
0D: Instruction caching and better benchmark function.
The previous benchmark function had a few flaws. First of all, it wasn't idiomatic Rust, because we used a loop construct that you would expect in C. Revamped that by using an iterator. Also, the previous benchmark got heavily optimized by the compiler, which unrolled the inner loop into a huge sequence of consecutive loads and stores, resulting in lots of instructions that needed to be fetched from DRAM. Additionally, instruction caching was not turned on. The new code compiles into two tight loops, fully leveraging the power of the I and D caches, and providing a great showcase.
This commit is contained in:
parent
c65e2e56cd
commit
68de789d15
Binary file not shown.
Binary file not shown.
@ -219,8 +219,8 @@ pub unsafe fn init() {
|
||||
// First, force all previous changes to be seen before the MMU is enabled.
|
||||
barrier::isb(barrier::SY);
|
||||
|
||||
// Enable the MMU and turn on caching
|
||||
SCTLR_EL1.modify(SCTLR_EL1::M::Enable + SCTLR_EL1::C::Cacheable);
|
||||
// Enable the MMU and turn on data and instruction caching.
|
||||
SCTLR_EL1.modify(SCTLR_EL1::M::Enable + SCTLR_EL1::C::Cacheable + SCTLR_EL1::I::Cacheable);
|
||||
|
||||
// Force MMU init to complete before next instruction
|
||||
barrier::isb(barrier::SY);
|
||||
|
@ -13,7 +13,7 @@ performance.
|
||||
## Benchmark
|
||||
|
||||
Let's write a tiny, arbitrary micro-benchmark to showcase the performance of
|
||||
operating on the same DRAM with caching enabled and disabled.
|
||||
operating with data on the same DRAM with caching enabled and disabled.
|
||||
|
||||
### mmu.rs
|
||||
|
||||
@ -31,7 +31,7 @@ block). This time, the block is configured as cacheable.
|
||||
We write a little function that iteratively reads memory of five times the size
|
||||
of a `cacheline`, in steps of 8 bytes, aka one processor register at a time. We
|
||||
read the value, add 1, and write it back. This whole process is repeated
|
||||
`100_000` times.
|
||||
`20_000` times.
|
||||
|
||||
### main.rs
|
||||
|
||||
@ -46,12 +46,12 @@ On my Raspberry, I get the following results:
|
||||
|
||||
```text
|
||||
Benchmarking non-cacheable DRAM modifications at virtual 0x00200000, physical 0x00400000:
|
||||
664 miliseconds.
|
||||
1040 miliseconds.
|
||||
|
||||
Benchmarking cacheable DRAM modifications at virtual 0x00400000, physical 0x00400000:
|
||||
148 miliseconds.
|
||||
53 miliseconds.
|
||||
|
||||
With caching, the function is 348% faster!
|
||||
With caching, the function is 1862% faster!
|
||||
```
|
||||
|
||||
Impressive, isn't it?
|
||||
|
Binary file not shown.
Binary file not shown.
@ -3,26 +3,25 @@ use cortex_a::{barrier, regs::*};
|
||||
|
||||
/// We assume that addr is cacheline aligned
|
||||
pub fn batch_modify(addr: u64) -> u32 {
|
||||
const CACHELINE_SIZE_BYTES: u64 = 64; // TODO: retrieve this from a system register
|
||||
const NUM_CACHELINES_TOUCHED: u64 = 5;
|
||||
const BYTES_PER_U64_REG: usize = 8;
|
||||
const NUM_BENCH_ITERATIONS: u64 = 100_000;
|
||||
const CACHELINE_SIZE_BYTES: usize = 64; // TODO: retrieve this from a system register
|
||||
const NUM_CACHELINES_TOUCHED: usize = 5;
|
||||
const NUM_BENCH_ITERATIONS: usize = 20_000;
|
||||
|
||||
const NUM_BYTES_TOUCHED: u64 = CACHELINE_SIZE_BYTES * NUM_CACHELINES_TOUCHED;
|
||||
const NUM_BYTES_TOUCHED: usize = CACHELINE_SIZE_BYTES * NUM_CACHELINES_TOUCHED;
|
||||
|
||||
let mem = unsafe { core::slice::from_raw_parts_mut(addr as *mut u64, NUM_BYTES_TOUCHED) };
|
||||
|
||||
// Benchmark starts here
|
||||
let t1 = CNTPCT_EL0.get();
|
||||
|
||||
compiler_fence(Ordering::SeqCst);
|
||||
|
||||
let mut data_ptr: *mut u64;
|
||||
let mut temp: u64;
|
||||
for _ in 0..NUM_BENCH_ITERATIONS {
|
||||
for i in (addr..(addr + NUM_BYTES_TOUCHED)).step_by(BYTES_PER_U64_REG) {
|
||||
data_ptr = i as *mut u64;
|
||||
|
||||
for qword in mem.iter_mut() {
|
||||
unsafe {
|
||||
temp = core::ptr::read_volatile(data_ptr);
|
||||
core::ptr::write_volatile(data_ptr, temp + 1);
|
||||
temp = core::ptr::read_volatile(qword);
|
||||
core::ptr::write_volatile(qword, temp + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -211,8 +211,8 @@ pub unsafe fn init() {
|
||||
// First, force all previous changes to be seen before the MMU is enabled.
|
||||
barrier::isb(barrier::SY);
|
||||
|
||||
// Enable the MMU and turn on caching
|
||||
SCTLR_EL1.modify(SCTLR_EL1::M::Enable + SCTLR_EL1::C::Cacheable);
|
||||
// Enable the MMU and turn on data and instruction caching.
|
||||
SCTLR_EL1.modify(SCTLR_EL1::M::Enable + SCTLR_EL1::C::Cacheable + SCTLR_EL1::I::Cacheable);
|
||||
|
||||
// Force MMU init to complete before next instruction
|
||||
barrier::isb(barrier::SY);
|
||||
|
Loading…
Reference in New Issue
Block a user