diff --git a/README.md b/README.md
index 68a8fc4..804bdb1 100644
--- a/README.md
+++ b/README.md
@@ -59,7 +59,7 @@ This repository hosts a RISC-V implementation written in SpinalHDL. Here are som
 
 - RV32I[M][A][F[D]][C] instruction set
 - Pipelined from 2 to 5+ stages ([Fetch*X], Decode, Execute, [Memory], [WriteBack])
-- 1.44 DMIPS/Mhz --no-inline when nearly all features are enabled (1.57 DMIPS/Mhz when the divider lookup table is enabled)
+- 1.44 DMIPS/MHz --no-inline when nearly all features are enabled (1.57 DMIPS/MHz when the divider lookup table is enabled)
 - Optimized for FPGA, does not use any vendor specific IP block / primitive
 - AXI4, Avalon, wishbone ready
 - Optional MUL/DIV extensions
@@ -97,54 +97,54 @@ dhrystone binaries which fit inside a 4KB I$ and 4KB D$ (I already had this case
 The CPU configurations used below can be found in the `src/scala/vexriscv/demo` directory.
 
 ```
-VexRiscv small (RV32I, 0.52 DMIPS/Mhz, no datapath bypass, no interrupt) ->
-    Artix 7     -> 243 Mhz 504 LUT 505 FF 
-    Cyclone V   -> 174 Mhz 352 ALMs
-    Cyclone IV  -> 179 Mhz 731 LUT 494 FF 
-    iCE40       -> 92 Mhz 1130 LC
+VexRiscv small (RV32I, 0.52 DMIPS/MHz, no datapath bypass, no interrupt) ->
+    Artix 7     -> 243 MHz 504 LUT 505 FF 
+    Cyclone V   -> 174 MHz 352 ALMs
+    Cyclone IV  -> 179 MHz 731 LUT 494 FF 
+    iCE40       -> 92 MHz 1130 LC
 
-VexRiscv small (RV32I, 0.52 DMIPS/Mhz, no datapath bypass) ->
-    Artix 7     -> 240 Mhz 556 LUT 566 FF 
-    Cyclone V   -> 194 Mhz 394 ALMs
-    Cyclone IV  -> 174 Mhz 831 LUT 555 FF 
-    iCE40       -> 85 Mhz 1292 LC
+VexRiscv small (RV32I, 0.52 DMIPS/MHz, no datapath bypass) ->
+    Artix 7     -> 240 MHz 556 LUT 566 FF 
+    Cyclone V   -> 194 MHz 394 ALMs
+    Cyclone IV  -> 174 MHz 831 LUT 555 FF 
+    iCE40       -> 85 MHz 1292 LC
 
-VexRiscv small and productive (RV32I, 0.82 DMIPS/Mhz)  ->
-    Artix 7     -> 232 Mhz 816 LUT 534 FF 
-    Cyclone V   -> 155 Mhz 492 ALMs
-    Cyclone IV  -> 155 Mhz 1,111 LUT 530 FF 
-    iCE40       -> 63 Mhz 1596 LC
+VexRiscv small and productive (RV32I, 0.82 DMIPS/MHz)  ->
+    Artix 7     -> 232 MHz 816 LUT 534 FF 
+    Cyclone V   -> 155 MHz 492 ALMs
+    Cyclone IV  -> 155 MHz 1,111 LUT 530 FF 
+    iCE40       -> 63 MHz 1596 LC
 
-VexRiscv small and productive with I$ (RV32I, 0.70 DMIPS/Mhz, 4KB-I$)  ->
-    Artix 7     -> 220 Mhz 730 LUT 570 FF 
-    Cyclone V   -> 142 Mhz 501 ALMs
-    Cyclone IV  -> 150 Mhz 1,139 LUT 536 FF 
-    iCE40       -> 66 Mhz 1680 LC
+VexRiscv small and productive with I$ (RV32I, 0.70 DMIPS/MHz, 4KB-I$)  ->
+    Artix 7     -> 220 MHz 730 LUT 570 FF 
+    Cyclone V   -> 142 MHz 501 ALMs
+    Cyclone IV  -> 150 MHz 1,139 LUT 536 FF 
+    iCE40       -> 66 MHz 1680 LC
 
-VexRiscv full no cache (RV32IM, 1.21 DMIPS/Mhz 2.30 Coremark/Mhz, single cycle barrel shifter, debug module, catch exceptions, static branch) ->
-    Artix 7     -> 216 Mhz 1418 LUT 949 FF 
-    Cyclone V   -> 133 Mhz 933 ALMs
-    Cyclone IV  -> 143 Mhz 2,076 LUT 972 FF 
+VexRiscv full no cache (RV32IM, 1.21 DMIPS/MHz 2.30 Coremark/MHz, single cycle barrel shifter, debug module, catch exceptions, static branch) ->
+    Artix 7     -> 216 MHz 1418 LUT 949 FF 
+    Cyclone V   -> 133 MHz 933 ALMs
+    Cyclone IV  -> 143 MHz 2,076 LUT 972 FF 
 
-VexRiscv full (RV32IM, 1.21 DMIPS/Mhz 2.30 Coremark/Mhz with cache trashing, 4KB-I$,4KB-D$, single cycle barrel shifter, debug module, catch exceptions, static branch) ->
-    Artix 7     -> 199 Mhz 1840 LUT 1158 FF 
-    Cyclone V   -> 141 Mhz 1,166 ALMs
-    Cyclone IV  -> 131 Mhz 2,407 LUT 1,067 FF 
+VexRiscv full (RV32IM, 1.21 DMIPS/MHz 2.30 Coremark/MHz with cache thrashing, 4KB-I$,4KB-D$, single cycle barrel shifter, debug module, catch exceptions, static branch) ->
+    Artix 7     -> 199 MHz 1840 LUT 1158 FF 
+    Cyclone V   -> 141 MHz 1,166 ALMs
+    Cyclone IV  -> 131 MHz 2,407 LUT 1,067 FF 
 
-VexRiscv full max perf (HZ*IPC) -> (RV32IM, 1.38 DMIPS/Mhz 2.57 Coremark/Mhz, 8KB-I$,8KB-D$, single cycle barrel shifter, debug module, catch exceptions, dynamic branch prediction in the fetch stage, branch and shift operations done in the Execute stage) ->
-    Artix 7     -> 200 Mhz 1935 LUT 1216 FF 
-    Cyclone V   -> 130 Mhz 1,166 ALMs
-    Cyclone IV  -> 126 Mhz 2,484 LUT 1,120 FF 
+VexRiscv full max perf (HZ*IPC) -> (RV32IM, 1.38 DMIPS/MHz 2.57 Coremark/MHz, 8KB-I$,8KB-D$, single cycle barrel shifter, debug module, catch exceptions, dynamic branch prediction in the fetch stage, branch and shift operations done in the Execute stage) ->
+    Artix 7     -> 200 MHz 1935 LUT 1216 FF 
+    Cyclone V   -> 130 MHz 1,166 ALMs
+    Cyclone IV  -> 126 MHz 2,484 LUT 1,120 FF 
 
-VexRiscv full with MMU (RV32IM, 1.24 DMIPS/Mhz 2.35 Coremark/Mhz, with cache trashing, 4KB-I$, 4KB-D$, single cycle barrel shifter, debug module, catch exceptions, dynamic branch, MMU) ->
-    Artix 7     -> 151 Mhz 2021 LUT 1541 FF 
-    Cyclone V   -> 124 Mhz 1,368 ALMs
-    Cyclone IV -> 128 Mhz 2,826 LUT 1,474 FF 
+VexRiscv full with MMU (RV32IM, 1.24 DMIPS/MHz 2.35 Coremark/MHz, with cache thrashing, 4KB-I$, 4KB-D$, single cycle barrel shifter, debug module, catch exceptions, dynamic branch, MMU) ->
+    Artix 7     -> 151 MHz 2021 LUT 1541 FF 
+    Cyclone V   -> 124 MHz 1,368 ALMs
+    Cyclone IV  -> 128 MHz 2,826 LUT 1,474 FF 
 
-VexRiscv linux balanced (RV32IMA, 1.21 DMIPS/Mhz 2.27 Coremark/Mhz, with cache trashing, 4KB-I$, 4KB-D$, single cycle barrel shifter, catch exceptions, static branch, MMU, Supervisor, Compatible with mainstream linux) ->
-    Artix 7     -> 180 Mhz 2883 LUT 2130 FF 
-    Cyclone V   -> 131 Mhz 1,764 ALMs
-    Cyclone IV  -> 121 Mhz 3,608 LUT 2,082 FF 
+VexRiscv linux balanced (RV32IMA, 1.21 DMIPS/MHz 2.27 Coremark/MHz, with cache thrashing, 4KB-I$, 4KB-D$, single cycle barrel shifter, catch exceptions, static branch, MMU, Supervisor, compatible with mainstream Linux) ->
+    Artix 7     -> 180 MHz 2883 LUT 2130 FF 
+    Cyclone V   -> 131 MHz 1,764 ALMs
+    Cyclone IV  -> 121 MHz 3,608 LUT 2,082 FF 
 ```
 
 The following configuration results in 1.44 DMIPS/MHz:
@@ -157,7 +157,7 @@ The following configuration results in 1.44 DMIPS/MHz:
 - single cycle multiplication with bypassing in the WB stage (late result)
 - dynamic branch prediction done in the F stage with a direct mapped target buffer cache (no penalties on correct predictions)
 
-Note that, recently, the capability to remove the Fetch/Memory/WriteBack stage was added to reduce the area of the CPU, which ends up with a smaller CPU and a better DMIPS/Mhz for the small configurations.
+Note that, recently, the capability to remove the Fetch/Memory/WriteBack stage was added to reduce the area of the CPU, which ends up with a smaller CPU and a better DMIPS/MHz for the small configurations.
 
 ## Dependencies
 
@@ -361,9 +361,9 @@ You can find some FPGA projects which instantiate the Briey SoC here (DE1-SoC, D
 Here are some measurements of Briey SoC timings and area:
 
 ```
-Artix 7     -> 181 Mhz 3220 LUT 3181 FF 
-Cyclone V   -> 142 Mhz 2,222 ALMs
-Cyclone IV  -> 130 Mhz 4,538 LUT 3,211 FF 
+Artix 7     -> 181 MHz 3220 LUT 3181 FF 
+Cyclone V   -> 142 MHz 2,222 ALMs
+Cyclone IV  -> 130 MHz 4,538 LUT 3,211 FF 
 ```
 
 ## Murax SoC
@@ -379,8 +379,8 @@ Murax is a very light SoC (it fits in an ICE40 FPGA) which can work without any
 - one UART with tx/rx fifo
 
 Depending on the CPU configuration, on the ICE40-hx8k FPGA with icestorm for synthesis, the full SoC has the following area/performance:
-- RV32I interlocked stages => 51 Mhz, 2387 LC 0.45 DMIPS/Mhz
-- RV32I bypassed stages    => 45 Mhz, 2718 LC 0.65 DMIPS/Mhz
+- RV32I interlocked stages => 51 MHz, 2387 LC 0.45 DMIPS/MHz
+- RV32I bypassed stages    => 45 MHz, 2718 LC 0.65 DMIPS/MHz
 
 Its implementation can be found here: `src/main/scala/vexriscv/demo/Murax.scala`.
 
@@ -415,17 +415,17 @@ You can find multiple software examples and demos here: https://github.com/Spina
 Here are some timing and area measurements of the Murax SoC:
 
 ```
-Murax interlocked stages (0.45 DMIPS/Mhz, 8 bits GPIO) ->
-    Artix 7     -> 216 Mhz 1109 LUT 1201 FF 
-    Cyclone V   -> 182 Mhz 725 ALMs
-    Cyclone IV  -> 147 Mhz 1,551 LUT 1,223 FF 
-    iCE40       ->  64 Mhz 2422 LC (nextpnr)
+Murax interlocked stages (0.45 DMIPS/MHz, 8 bits GPIO) ->
+    Artix 7     -> 216 MHz 1109 LUT 1201 FF 
+    Cyclone V   -> 182 MHz 725 ALMs
+    Cyclone IV  -> 147 MHz 1,551 LUT 1,223 FF 
+    iCE40       ->  64 MHz 2422 LC (nextpnr)
 
-MuraxFast bypassed stages (0.65 DMIPS/Mhz, 8 bits GPIO) ->
-    Artix 7     -> 224 Mhz 1278 LUT 1300 FF 
-    Cyclone V   -> 173 Mhz 867 ALMs
-    Cyclone IV  -> 143 Mhz 1,755 LUT 1,258 FF 
-    iCE40       ->  66 Mhz 2799 LC (nextpnr)
+MuraxFast bypassed stages (0.65 DMIPS/MHz, 8 bits GPIO) ->
+    Artix 7     -> 224 MHz 1278 LUT 1300 FF 
+    Cyclone V   -> 173 MHz 867 ALMs
+    Cyclone IV  -> 143 MHz 1,755 LUT 1,258 FF 
+    iCE40       ->  66 MHz 2799 LC (nextpnr)
 ```
 
 Some scripts to generate the SoC and call the icestorm toolchain can be found here: `scripts/Murax/`
@@ -814,11 +814,11 @@ Synthesis results of the FPU itself, without the CPU integration, on the fast sp
 
 ```
 Fpu 32 bits ->
-  Artix 7 relaxed -> 135 Mhz 1786 LUT 1778 FF 
-  Artix 7 FMax    -> 205 Mhz 2101 LUT 1778 FF 
+  Artix 7 relaxed -> 135 MHz 1786 LUT 1778 FF 
+  Artix 7 FMax    -> 205 MHz 2101 LUT 1778 FF 
 Fpu 64/32 bits ->
-  Artix 7 relaxed -> 101 Mhz 3336 LUT 3033 FF 
-  Artix 7 FMax    -> 165 Mhz 3728 LUT 3175 FF 
+  Artix 7 relaxed -> 101 MHz 3336 LUT 3033 FF 
+  Artix 7 FMax    -> 165 MHz 3728 LUT 3175 FF 
 ```
 
 Note that if you want to debug FPU code via the openocd_riscv.vexriscv target, you need to use the GDB from : 
diff --git a/doc/gcdPeripheral/src/main/scala/vexriscv/demo/Murax.scala b/doc/gcdPeripheral/src/main/scala/vexriscv/demo/Murax.scala
index f3d4f6c..486912a 100644
--- a/doc/gcdPeripheral/src/main/scala/vexriscv/demo/Murax.scala
+++ b/doc/gcdPeripheral/src/main/scala/vexriscv/demo/Murax.scala
@@ -27,8 +27,8 @@ import vexriscv.periph.tasks.hash._
 /** Created by PIC32F_USER on 28/07/2017.
   *
   * Murax is a very light SoC which could work without any external component.
-  *   - ICE40-hx8k + icestorm => 53 Mhz, 2142 LC
-  *   - 0.37 DMIPS/Mhz
+  *   - ICE40-hx8k + icestorm => 53 MHz, 2142 LC
+  *   - 0.37 DMIPS/MHz
   *   - 8 kB of on-chip ram
   *   - JTAG debugger (eclipse/GDB/openocd ready)
   *   - Interrupt support
diff --git a/src/main/scala/vexriscv/demo/Murax.scala b/src/main/scala/vexriscv/demo/Murax.scala
index 95a35b5..d7022f5 100644
--- a/src/main/scala/vexriscv/demo/Murax.scala
+++ b/src/main/scala/vexriscv/demo/Murax.scala
@@ -22,8 +22,8 @@ import scala.collection.Seq
  * Created by PIC32F_USER on 28/07/2017.
  *
  * Murax is a very light SoC which could work without any external component.
- * - ICE40-hx8k + icestorm =>  53 Mhz, 2142 LC
- * - 0.37 DMIPS/Mhz
+ * - ICE40-hx8k + icestorm =>  53 MHz, 2142 LC
+ * - 0.37 DMIPS/MHz
  * - 8 kB of on-chip ram
  * - JTAG debugger (eclipse/GDB/openocd ready)
  * - Interrupt support
diff --git a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
index b78f84f..0f46a25 100644
--- a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
+++ b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
@@ -1758,17 +1758,17 @@ object FpuSynthesisBench extends App{
   }
 
 //    rotate2_24 ->
-//    Artix 7 -> 233 Mhz 96 LUT 167 FF
-//  Artix 7 -> 420 Mhz 86 LUT 229 FF
+//    Artix 7 -> 233 MHz 96 LUT 167 FF
+//  Artix 7 -> 420 MHz 86 LUT 229 FF
 //  rotate2_32 ->
-//    Artix 7 -> 222 Mhz 108 LUT 238 FF
-//  Artix 7 -> 399 Mhz 110 LUT 300 FF
+//    Artix 7 -> 222 MHz 108 LUT 238 FF
+//  Artix 7 -> 399 MHz 110 LUT 300 FF
 //  rotate2_52 ->
-//    Artix 7 -> 195 Mhz 230 LUT 362 FF
-//  Artix 7 -> 366 Mhz 225 LUT 486 FF
+//    Artix 7 -> 195 MHz 230 LUT 362 FF
+//  Artix 7 -> 366 MHz 225 LUT 486 FF
 //  rotate2_64 ->
-//    Artix 7 -> 182 Mhz 257 LUT 465 FF
-//  Artix 7 -> 359 Mhz 266 LUT 591 FF
+//    Artix 7 -> 182 MHz 257 LUT 465 FF
+//  Artix 7 -> 359 MHz 266 LUT 591 FF
   class Rotate2(width : Int) extends Rtl{
     override def getName(): String = "rotate2_" + width
     override def getRtlPath(): String = getName() + ".v"
@@ -1858,56 +1858,56 @@ object FpuSynthesisBench extends App{
 }
 
 //Fpu_32 ->
-//Artix 7 -> 136 Mhz 1471 LUT 1336 FF
-//Artix 7 -> 196 Mhz 1687 LUT 1371 FF
+//Artix 7 -> 136 MHz 1471 LUT 1336 FF
+//Artix 7 -> 196 MHz 1687 LUT 1371 FF
 //Fpu_64 ->
-//Artix 7 -> 105 Mhz 2822 LUT 2132 FF
-//Artix 7 -> 161 Mhz 3114 LUT 2272 FF
+//Artix 7 -> 105 MHz 2822 LUT 2132 FF
+//Artix 7 -> 161 MHz 3114 LUT 2272 FF
 //
 //
 //
 //Fpu_32 ->
-//Artix 7 -> 128 Mhz 1693 LUT 1481 FF
-//Artix 7 -> 203 Mhz 1895 LUT 1481 FF
+//Artix 7 -> 128 MHz 1693 LUT 1481 FF
+//Artix 7 -> 203 MHz 1895 LUT 1481 FF
 //Fpu_64 ->
-//Artix 7 -> 99 Mhz 3073 LUT 2396 FF
-//Artix 7 -> 164 Mhz 3433 LUT 2432 FF
+//Artix 7 -> 99 MHz 3073 LUT 2396 FF
+//Artix 7 -> 164 MHz 3433 LUT 2432 FF
 
 
 //Fpu_32 ->
-//Artix 7 -> 112 Mhz 1790 LUT 1666 FF
-//Artix 7 -> 158 Mhz 1989 LUT 1701 FF
+//Artix 7 -> 112 MHz 1790 LUT 1666 FF
+//Artix 7 -> 158 MHz 1989 LUT 1701 FF
 //Fpu_64 ->
-//Artix 7 -> 100 Mhz 3294 LUT 2763 FF
-//Artix 7 -> 151 Mhz 3708 LUT 2904 FF
+//Artix 7 -> 100 MHz 3294 LUT 2763 FF
+//Artix 7 -> 151 MHz 3708 LUT 2904 FF
 
 //Fpu_32 ->
-//Artix 7 -> 139 Mhz 1879 LUT 1713 FF
-//Artix 7 -> 206 Mhz 2135 LUT 1723 FF
+//Artix 7 -> 139 MHz 1879 LUT 1713 FF
+//Artix 7 -> 206 MHz 2135 LUT 1723 FF
 //Fpu_64 ->
-//Artix 7 -> 106 Mhz 3502 LUT 2811 FF
-//Artix 7 -> 163 Mhz 3905 LUT 2951 FF
+//Artix 7 -> 106 MHz 3502 LUT 2811 FF
+//Artix 7 -> 163 MHz 3905 LUT 2951 FF
 
 //Fpu_32 ->
-//Artix 7 -> 130 Mhz 1889 LUT 1835 FF
-//Artix 7 -> 210 Mhz 2131 LUT 1845 FF
+//Artix 7 -> 130 MHz 1889 LUT 1835 FF
+//Artix 7 -> 210 MHz 2131 LUT 1845 FF
 //Fpu_64 ->
-//Artix 7 -> 106 Mhz 3322 LUT 3023 FF
-//Artix 7 -> 161 Mhz 3675 LUT 3163 FF
+//Artix 7 -> 106 MHz 3322 LUT 3023 FF
+//Artix 7 -> 161 MHz 3675 LUT 3163 FF
 
 //Fpu_32 ->
-//Artix 7 -> 132 Mhz 1891 LUT 1837 FF
-//Artix 7 -> 209 Mhz 2132 LUT 1847 FF
+//Artix 7 -> 132 MHz 1891 LUT 1837 FF
+//Artix 7 -> 209 MHz 2132 LUT 1847 FF
 //Fpu_64 ->
-//Artix 7 -> 105 Mhz 3348 LUT 3024 FF
-//Artix 7 -> 162 Mhz 3712 LUT 3165 FF
+//Artix 7 -> 105 MHz 3348 LUT 3024 FF
+//Artix 7 -> 162 MHz 3712 LUT 3165 FF
 
 //Fpu_32 ->
-//Artix 7 -> 128 Mhz 1796 LUT 1727 FF
-//Artix 7 -> 208 Mhz 2049 LUT 1727 FF
+//Artix 7 -> 128 MHz 1796 LUT 1727 FF
+//Artix 7 -> 208 MHz 2049 LUT 1727 FF
 //Fpu_64 ->
-//Artix 7 -> 109 Mhz 3417 LUT 2913 FF
-//Artix 7 -> 168 Mhz 3844 LUT 3053 FF
+//Artix 7 -> 109 MHz 3417 LUT 2913 FF
+//Artix 7 -> 168 MHz 3844 LUT 3053 FF
 
 /*
 testfloat  -tininessafter -all1 > all1.txt
diff --git a/src/main/scala/vexriscv/plugin/AesPlugin.scala b/src/main/scala/vexriscv/plugin/AesPlugin.scala
index 0d4556a..eec48e5 100644
--- a/src/main/scala/vexriscv/plugin/AesPlugin.scala
+++ b/src/main/scala/vexriscv/plugin/AesPlugin.scala
@@ -53,7 +53,7 @@ import vexriscv.{DecoderService, Stageable, VexRiscv}
   * - SS specify which byte should be used from RS2 for the processing
   *
   * In practice the aes-256-cbc performances should improve by a factor 4. See the following results from libopenssl
-  * from a SoC running linux at 100 Mhz
+  * from a SoC running Linux at 100 MHz
   *   type                 16 bytes     64 bytes    256 bytes   1024 bytes   8192 bytes  16384 bytes
   *   aes-256-cbc SW         492.58k      700.22k      796.41k      831.49k      830.09k      832.81k
   *   aes-256 cbc HW        1781.52k     2834.07k     3323.07k     3486.72k     3465.22k     3440.10k
diff --git a/src/test/scala/vexriscv/DhrystoneBench.scala b/src/test/scala/vexriscv/DhrystoneBench.scala
index 48d1b67..3fabf42 100644
--- a/src/test/scala/vexriscv/DhrystoneBench.scala
+++ b/src/test/scala/vexriscv/DhrystoneBench.scala
@@ -45,7 +45,7 @@ class DhrystoneBench extends AnyFunSuite {
       val coremarkIterations = intFind.findFirstIn("Iterations       \\: (\\d+.?)+".r.findAllIn(str).toList.last).get.toDouble
       val coremarkHzs = intFind.findFirstIn("DCLOCKS_PER_SEC=(\\d+.?)+".r.findAllIn(str).toList.last).get.toDouble
       val coremarkPerMhz = 1e6 * coremarkIterations / coremarkTicks
-      report ++= s"$name -> $dmips DMIPS/Mhz $coremarkPerMhz Coremark/Mhz\n"
+      report ++= s"$name -> $dmips DMIPS/MHz $coremarkPerMhz Coremark/MHz\n"
     }
 
   }