From 68de789d15555514aa92bf83f3f53a3b43fbf34f Mon Sep 17 00:00:00 2001 From: Andre Richter Date: Tue, 2 Oct 2018 22:59:27 +0200 Subject: [PATCH] 0D: Instruction caching and better benchmark function. The previous benchmark function had a few flaws. First of all, it wasn't idiomatic Rust, because we used a loop construct that you would expect in C. Revamped that by using an iterator. Also, the previous benchmark got heavily optimized by the compiler, which unrolled the inner loop it into a huge sequence of consecutive loads and stores, resulting in lots of instructions that needed to be fetched from DRAM. Additionally, instruction caching was not turned on. The new code compiles into two tight loops, fully leveraging the power of the I and D caches, and providing an great showcase. --- 0C_virtual_memory/kernel8 | Bin 73056 -> 73056 bytes 0C_virtual_memory/kernel8.img | Bin 2572 -> 2572 bytes 0C_virtual_memory/src/mmu.rs | 4 ++-- 0D_cache_performance/README.md | 10 +++++----- 0D_cache_performance/kernel8 | Bin 79448 -> 79496 bytes 0D_cache_performance/kernel8.img | Bin 6912 -> 6016 bytes 0D_cache_performance/src/benchmark.rs | 21 ++++++++++----------- 0D_cache_performance/src/mmu.rs | 4 ++-- 8 files changed, 19 insertions(+), 20 deletions(-) diff --git a/0C_virtual_memory/kernel8 b/0C_virtual_memory/kernel8 index 5fab5fd377f738ebfc4fd9004d40dee6b64494ea..727ac4099686cd6c01d27ee380b2e1aa883bff88 100755 GIT binary patch delta 202 zcmaE`i{-&CmWC~iMU0G1+lv_)Q~4P?w%-(B{LaXzJpG~&qc~&W_LoA8>lr0au`ntBTCm{d; delta 200 zcmaE`i{-&CmWC~iMU0FM+lv_)Q~4R&x8D?C{LaXzIQ^m!qd24g_LoA8>lr0aurM$P zF)%XR;NV~Y0SL*!P&a*}2&1@UGb=U-QPL) zeB68PZ7*GOzxuoT^%#ztjcvYk2<(v#4XrcZ)*Lwq|#m7ChiGRue_IzI0yf0uj@R~csi5Z#tbh-JIM>@Jm&{{a=Dow0a z&iZxpeU$pidwO4DrP2@TGMK9_ahdcJuDTG?)XFw~R z+(AV&(!2wGlbAR1s|Pw<7vr>`ggz5xkX(x1Grn!{(+`G;9u9Z+_$+z!}xhmu5 zRM5j5v>Xy&%Stl!S9Ei{SSEUEN}{vXnkj`0EB$hpFHxrqrc^RCrINkcQnQq*xkk1J zsG5%{`vX)>4^Ep>O(|6)W_hq3b zJekif_Tu9n42XepT!KMywH)U+>{wvt76BcF_sGOi_eVZjHSr+aEg{+18#b4Xg++&s zQ9a{56f}P;uf1bgEISkwn{`}=8F6068CWf@>$uaM**{h-w)ipW&TOFnj{C93n|T0z zXS#CL*&qD4zDQeGCOSe`Z=TgfDug2y>44Yn+A8}mz{Y?`gmJevXS(=2jEfhgrWfj0 z@LXs6StN_-6P+gQ7z6d3J^JtSnx|?-stVgHa=bReItXTuRN-^QL+8^j5sqLAd&P?p z?7=g_j9>`IrMPPgL$_UOFv=O^Z_G?!v^M zB(Bj3?L^6&UDCrI;z$}95TKMDzs6j}Q@!%0xG@t$hRSn}W8+ks?GCQeI_XI37^fR8 zE=+fb`KYImpOD;uW?!K~)u9qnbwrD)ZVm6*U$Y$mbTd~0`siV{Mw+=ww2$qNGna|Z z1o(gzJs#RXO#@YNSo(o4?2W?jrI?W0MjK5-19hX=7P#L)-8{uryQLl5sj5|e)Xh|< zru?W=`%2o|2z zv&_Q|-p{<pa7Ut(2d@FJ5a;OK*&AW4qxEpTd&T#hzpCE-U2QX=?S2)6a;5k8&;-WG>=r6#b$(f?ogAj!4(- zQ)*G0La#6{l|rMh7waY{9$>9otej0PP$k7&DY~RsBgF<=JZji)21455c^C74{Dl$s z5x2Hgaq9<;HO8gaTT=MM!WhM$Aw@!pRZ_el#amKjrMM)%i(#Fy!tpe#CuXOEtEL7h z%_rI15tYE5DRl6^z z4zaj8->vq!0_zGZWjBGHPw%SP$^P2_q=}u_`O~J335nxQp>-0s4O=lO%F+l$5TXtu)}~PvD#OA^ zrB3T;UlyT=RjkUZLJopN!$4Y{Rcb-dw1`PTLO@%j4XNFxO#-Lgv{tmJWo04~!u!5= zzeDxgldSvP_xHQs{odX0?)mu}-oIV*&*6sv%jpxFbLJ*$#mI4C6dR_%hnWw9k1!trA7wrYo@Jf|&oR%5mF1iH!DIJ%E+w$%CH8NP=;$T?g}}jy zfoqoVgqv zWDbJE%wcdHa~&8(>BK})W@F4TaI<1F+l)?v4GC}?a~n9xoCK$sQ{XP`d<2Z5bjBV9&oa+~=a`+M zl(Qb%Obzl+9=eMf<$pZXf~MX}=9(t!3d$iWq8D*TpAr*M>vQfu^xfiVFFjgJF}b>w zeoRgBC#95%yDLqcpGF?Uu6qmco=sEB73(Il&?=tG}PCp4+S59mE(|1$gLU%5{xUHNr zMaHk|<;f6|&4p+wC~Ex9m!wvGPsGiLe5;D4j1x7otD1VY3pg7bV^>9^irD|6MfIXGOae4=cds9j8iY0M{UqBKm~0zatVmMOxUiCcS6_*}m{kTiNoAm{k zo?!i~OTUNpn~wei{l55`vq5i|wrpWNS|j^n)VzDE(`qD8wob{0r9ge7Zmm%1>hyF?F6NTnneuJmb=JS`WI?vj0@r3A|lWnC{E0Kf0<= ze^u$GSSl+t==abG>Gvqbdm09P?|(c#0l4FZgI@!4;`~1;;wAES=R?#XW7g8gv-C5 z^_N`wAnRvbx>`~)|L>OJY7l2!`3^aOu+|lYMY4r&Tt z3jZ%`UGM0#&Q1#PH0%0|==GR~ULoL=wrAAWD{x`)%>T5sV$(ttHTjF(n(Wgr|_S~esE8TJ@V^LT2H5B WV~Xyp{7ii+#c9m%JaS{0BL4$Pln?O$ diff --git a/0D_cache_performance/kernel8.img b/0D_cache_performance/kernel8.img index ec0a16b0869abc221cdd3eaa4386dc7c504a0dd8..d68c9481518e50b021f1db6216a1b09ea652038c 100755 GIT binary patch delta 794 zcmY*XOK1~O6uoa|>P#lBnKaWR4XycDTU&&}5Q^d=F+wc`p~g}y2yI;zn}wK#tD31l zP+Rcv0)j5my3!VLN>&jU3L;2F1kJ*gNh@fG;G&y$<9H`CkvQ-^_ndp~y@xmYh(7!v z!hIDsV-tkjU|BMqdj*i61&Gf?__Ar5f!#u~!~v*bjORnilIj)C-{&A-;9z0SG$#pw zk>ir3=xZNMaeBIv<*1rmN~^-^Rh~fJ;-ivEEqq(-)&5}p>!$gzfzPWn=4kd1M=WPV zbq~EH+J$xzw1idOtxY$JWfb~^*nIsTFKg$T{HkI%*u4A1H zN$xMV0nOZoy#nX1FQJJ+j1xqz6EkN{77~1U2n8-&wGN1n5ot?a)YCQIH6$&{kF9}x zuJ@0syl?1Zv*}Z(sRy)owi6@zU7_0t>_OkO3_8%%v=}Kh`Ln#Ng_{X2PBvSd+~dZm z8{PcE+!b{59H(DFH#4NY#f=-?uzvA4QP_-lnE2RDaRB!XG1ud#JJ=!ukuQL+19~U@ z=EQNcI4WHbiOLF|GsMq+ct&DNCurp~;2TKmwGH8!jFQ?g?7Q$BoD0(NYLF$AKE;sM z{Pd@%ch5=~U-Pr9a*-J9lX8fJ*l*>QU@TWd?3y<&7z=oL;q8%(BShXxRYJwj-pg&= zsScJ1L;uY9l9Gkz!jBg!8_csF7`4wnwstUpPg`G#mVl+vkg~YyPfC2 afd5$I%N%5>?y1g-Ba1jN_>1e^Q{*qhgz&-u delta 1708 zcmZA2ZD<>H90&0K-6d&yw&|9%*_yR!a@MrVi%yOjB;pJiM9GUX;)q&g4?&7#QXzxB zC~g;v1Swi_y&9puXyz+Zh3P6nUkFq7rfi@u);GQ}*%a-&4IeNd#_wIekz(L-pWpxY ze>ljOOZikeQH{u7yT8!RN>bUnBE4eNC9!c)h}jDfWxvzu9Q%DMwJQrTJc0V%P--`D z)b-+@XKgMBp z@9uwdR9)}&>*4N!%7lI!{R3rqd%F3|{K zoTany96bl;=p3A<^P+7O=UL37ah^U87w7`KKrg^Wx(F}Qi*Si9!AtZKyi6~{*XV2T zietkpuAr!}QGr+KRd|hFgV*VGxJK9DO?p$bTMs+uww_b=XJ-T%8u8Xkfqr9ev!w*$&-McJ2?I1F^rw2k{3&eS_jE(|Dy_mk+6Vh-KOCfk zaEK1U2&J?Y{cOidIr{M9Zt{*I7ugAgwiU1Q>O5J!exw2 znhy|#h=|-1cv*^a#Qc%<*?)Gva=qLqY1Yud^U|dC(m+D{rq67i5qKqfw+}vbHB=$0 z2d6~4ANP}&VrI4#vu+Gb9;te-Pny3DjsIm&$X=hc9t^xHRjt%uSemvj48G&4zSo+z zei@u~RWAb*zA u32 { - const CACHELINE_SIZE_BYTES: u64 = 64; // TODO: retrieve this from a system register - const NUM_CACHELINES_TOUCHED: u64 = 5; - const BYTES_PER_U64_REG: usize = 8; - const NUM_BENCH_ITERATIONS: u64 = 100_000; + const CACHELINE_SIZE_BYTES: usize = 64; // TODO: retrieve this from a system register + const NUM_CACHELINES_TOUCHED: usize = 5; + const NUM_BENCH_ITERATIONS: usize = 20_000; - const NUM_BYTES_TOUCHED: u64 = CACHELINE_SIZE_BYTES * NUM_CACHELINES_TOUCHED; + const NUM_BYTES_TOUCHED: usize = CACHELINE_SIZE_BYTES * NUM_CACHELINES_TOUCHED; + let mem = unsafe { core::slice::from_raw_parts_mut(addr as *mut u64, NUM_BYTES_TOUCHED) }; + + // Benchmark starts here let t1 = CNTPCT_EL0.get(); compiler_fence(Ordering::SeqCst); - let mut data_ptr: *mut u64; let mut temp: u64; for _ in 0..NUM_BENCH_ITERATIONS { - for i in (addr..(addr + NUM_BYTES_TOUCHED)).step_by(BYTES_PER_U64_REG) { - data_ptr = i as *mut u64; - + for qword in mem.iter_mut() { unsafe { - temp = core::ptr::read_volatile(data_ptr); - core::ptr::write_volatile(data_ptr, temp + 1); + temp = core::ptr::read_volatile(qword); + core::ptr::write_volatile(qword, temp + 1); } } } diff --git a/0D_cache_performance/src/mmu.rs b/0D_cache_performance/src/mmu.rs index a379c769..df48d434 100644 --- a/0D_cache_performance/src/mmu.rs +++ b/0D_cache_performance/src/mmu.rs @@ -211,8 +211,8 @@ pub unsafe fn init() { // First, force all previous changes to be seen before the MMU is enabled. barrier::isb(barrier::SY); - // Enable the MMU and turn on caching - SCTLR_EL1.modify(SCTLR_EL1::M::Enable + SCTLR_EL1::C::Cacheable); + // Enable the MMU and turn on data and instruction caching. + SCTLR_EL1.modify(SCTLR_EL1::M::Enable + SCTLR_EL1::C::Cacheable + SCTLR_EL1::I::Cacheable); // Force MMU init to complete before next instruction barrier::isb(barrier::SY);