diff --git a/README.md b/README.md index 68a8fc4..804bdb1 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ This repository hosts a RISC-V implementation written in SpinalHDL. Here are som - RV32I[M][A][F[D]][C] instruction set - Pipelined from 2 to 5+ stages ([Fetch*X], Decode, Execute, [Memory], [WriteBack]) -- 1.44 DMIPS/Mhz --no-inline when nearly all features are enabled (1.57 DMIPS/Mhz when the divider lookup table is enabled) +- 1.44 DMIPS/MHz --no-inline when nearly all features are enabled (1.57 DMIPS/MHz when the divider lookup table is enabled) - Optimized for FPGA, does not use any vendor specific IP block / primitive - AXI4, Avalon, wishbone ready - Optional MUL/DIV extensions @@ -97,54 +97,54 @@ dhrystone binaries which fit inside a 4KB I$ and 4KB D$ (I already had this case The CPU configurations used below can be found in the `src/scala/vexriscv/demo` directory. ``` -VexRiscv small (RV32I, 0.52 DMIPS/Mhz, no datapath bypass, no interrupt) -> - Artix 7 -> 243 Mhz 504 LUT 505 FF - Cyclone V -> 174 Mhz 352 ALMs - Cyclone IV -> 179 Mhz 731 LUT 494 FF - iCE40 -> 92 Mhz 1130 LC +VexRiscv small (RV32I, 0.52 DMIPS/MHz, no datapath bypass, no interrupt) -> + Artix 7 -> 243 MHz 504 LUT 505 FF + Cyclone V -> 174 MHz 352 ALMs + Cyclone IV -> 179 MHz 731 LUT 494 FF + iCE40 -> 92 MHz 1130 LC -VexRiscv small (RV32I, 0.52 DMIPS/Mhz, no datapath bypass) -> - Artix 7 -> 240 Mhz 556 LUT 566 FF - Cyclone V -> 194 Mhz 394 ALMs - Cyclone IV -> 174 Mhz 831 LUT 555 FF - iCE40 -> 85 Mhz 1292 LC +VexRiscv small (RV32I, 0.52 DMIPS/MHz, no datapath bypass) -> + Artix 7 -> 240 MHz 556 LUT 566 FF + Cyclone V -> 194 MHz 394 ALMs + Cyclone IV -> 174 MHz 831 LUT 555 FF + iCE40 -> 85 MHz 1292 LC -VexRiscv small and productive (RV32I, 0.82 DMIPS/Mhz) -> - Artix 7 -> 232 Mhz 816 LUT 534 FF - Cyclone V -> 155 Mhz 492 ALMs - Cyclone IV -> 155 Mhz 1,111 LUT 530 FF - iCE40 -> 63 Mhz 1596 LC +VexRiscv small and productive (RV32I, 0.82 DMIPS/MHz) -> + Artix 7 -> 232 MHz 816 LUT 534 
FF + Cyclone V -> 155 MHz 492 ALMs + Cyclone IV -> 155 MHz 1,111 LUT 530 FF + iCE40 -> 63 MHz 1596 LC -VexRiscv small and productive with I$ (RV32I, 0.70 DMIPS/Mhz, 4KB-I$) -> - Artix 7 -> 220 Mhz 730 LUT 570 FF - Cyclone V -> 142 Mhz 501 ALMs - Cyclone IV -> 150 Mhz 1,139 LUT 536 FF - iCE40 -> 66 Mhz 1680 LC +VexRiscv small and productive with I$ (RV32I, 0.70 DMIPS/MHz, 4KB-I$) -> + Artix 7 -> 220 MHz 730 LUT 570 FF + Cyclone V -> 142 MHz 501 ALMs + Cyclone IV -> 150 MHz 1,139 LUT 536 FF + iCE40 -> 66 MHz 1680 LC -VexRiscv full no cache (RV32IM, 1.21 DMIPS/Mhz 2.30 Coremark/Mhz, single cycle barrel shifter, debug module, catch exceptions, static branch) -> - Artix 7 -> 216 Mhz 1418 LUT 949 FF - Cyclone V -> 133 Mhz 933 ALMs - Cyclone IV -> 143 Mhz 2,076 LUT 972 FF +VexRiscv full no cache (RV32IM, 1.21 DMIPS/MHz 2.30 Coremark/MHz, single cycle barrel shifter, debug module, catch exceptions, static branch) -> + Artix 7 -> 216 MHz 1418 LUT 949 FF + Cyclone V -> 133 MHz 933 ALMs + Cyclone IV -> 143 MHz 2,076 LUT 972 FF -VexRiscv full (RV32IM, 1.21 DMIPS/Mhz 2.30 Coremark/Mhz with cache trashing, 4KB-I$,4KB-D$, single cycle barrel shifter, debug module, catch exceptions, static branch) -> - Artix 7 -> 199 Mhz 1840 LUT 1158 FF - Cyclone V -> 141 Mhz 1,166 ALMs - Cyclone IV -> 131 Mhz 2,407 LUT 1,067 FF +VexRiscv full (RV32IM, 1.21 DMIPS/MHz 2.30 Coremark/MHz with cache thrashing, 4KB-I$,4KB-D$, single cycle barrel shifter, debug module, catch exceptions, static branch) -> + Artix 7 -> 199 MHz 1840 LUT 1158 FF + Cyclone V -> 141 MHz 1,166 ALMs + Cyclone IV -> 131 MHz 2,407 LUT 1,067 FF -VexRiscv full max perf (HZ*IPC) -> (RV32IM, 1.38 DMIPS/Mhz 2.57 Coremark/Mhz, 8KB-I$,8KB-D$, single cycle barrel shifter, debug module, catch exceptions, dynamic branch prediction in the fetch stage, branch and shift operations done in the Execute stage) -> - Artix 7 -> 200 Mhz 1935 LUT 1216 FF - Cyclone V -> 130 Mhz 1,166 ALMs - Cyclone IV -> 126 Mhz 2,484 LUT 1,120 FF +VexRiscv full max 
perf (HZ*IPC) -> (RV32IM, 1.38 DMIPS/MHz 2.57 Coremark/MHz, 8KB-I$,8KB-D$, single cycle barrel shifter, debug module, catch exceptions, dynamic branch prediction in the fetch stage, branch and shift operations done in the Execute stage) -> + Artix 7 -> 200 MHz 1935 LUT 1216 FF + Cyclone V -> 130 MHz 1,166 ALMs + Cyclone IV -> 126 MHz 2,484 LUT 1,120 FF -VexRiscv full with MMU (RV32IM, 1.24 DMIPS/Mhz 2.35 Coremark/Mhz, with cache trashing, 4KB-I$, 4KB-D$, single cycle barrel shifter, debug module, catch exceptions, dynamic branch, MMU) -> - Artix 7 -> 151 Mhz 2021 LUT 1541 FF - Cyclone V -> 124 Mhz 1,368 ALMs - Cyclone IV -> 128 Mhz 2,826 LUT 1,474 FF +VexRiscv full with MMU (RV32IM, 1.24 DMIPS/MHz 2.35 Coremark/MHz, with cache thrashing, 4KB-I$, 4KB-D$, single cycle barrel shifter, debug module, catch exceptions, dynamic branch, MMU) -> + Artix 7 -> 151 MHz 2021 LUT 1541 FF + Cyclone V -> 124 MHz 1,368 ALMs + Cyclone IV -> 128 MHz 2,826 LUT 1,474 FF -VexRiscv linux balanced (RV32IMA, 1.21 DMIPS/Mhz 2.27 Coremark/Mhz, with cache trashing, 4KB-I$, 4KB-D$, single cycle barrel shifter, catch exceptions, static branch, MMU, Supervisor, Compatible with mainstream linux) -> - Artix 7 -> 180 Mhz 2883 LUT 2130 FF - Cyclone V -> 131 Mhz 1,764 ALMs - Cyclone IV -> 121 Mhz 3,608 LUT 2,082 FF +VexRiscv linux balanced (RV32IMA, 1.21 DMIPS/MHz 2.27 Coremark/MHz, with cache thrashing, 4KB-I$, 4KB-D$, single cycle barrel shifter, catch exceptions, static branch, MMU, Supervisor, Compatible with mainstream linux) -> + Artix 7 -> 180 MHz 2883 LUT 2130 FF + Cyclone V -> 131 MHz 1,764 ALMs + Cyclone IV -> 121 MHz 3,608 LUT 2,082 FF ``` The following configuration results in 1.44 DMIPS/MHz: @@ -157,7 +157,7 @@ The following configuration results in 1.44 DMIPS/MHz: - single cycle multiplication with bypassing in the WB stage (late result) - dynamic branch prediction done in the F stage with a direct mapped target buffer cache (no penalties on correct predictions) -Note that, recently, the 
capability to remove the Fetch/Memory/WriteBack stage was added to reduce the area of the CPU, which ends up with a smaller CPU and a better DMIPS/Mhz for the small configurations. +Note that, recently, the capability to remove the Fetch/Memory/WriteBack stage was added to reduce the area of the CPU, which ends up with a smaller CPU and a better DMIPS/MHz for the small configurations. ## Dependencies @@ -361,9 +361,9 @@ You can find some FPGA projects which instantiate the Briey SoC here (DE1-SoC, D Here are some measurements of Briey SoC timings and area: ``` -Artix 7 -> 181 Mhz 3220 LUT 3181 FF -Cyclone V -> 142 Mhz 2,222 ALMs -Cyclone IV -> 130 Mhz 4,538 LUT 3,211 FF +Artix 7 -> 181 MHz 3220 LUT 3181 FF +Cyclone V -> 142 MHz 2,222 ALMs +Cyclone IV -> 130 MHz 4,538 LUT 3,211 FF ``` ## Murax SoC @@ -379,8 +379,8 @@ Murax is a very light SoC (it fits in an ICE40 FPGA) which can work without any - one UART with tx/rx fifo Depending on the CPU configuration, on the ICE40-hx8k FPGA with icestorm for synthesis, the full SoC has the following area/performance: -- RV32I interlocked stages => 51 Mhz, 2387 LC 0.45 DMIPS/Mhz -- RV32I bypassed stages => 45 Mhz, 2718 LC 0.65 DMIPS/Mhz +- RV32I interlocked stages => 51 MHz, 2387 LC 0.45 DMIPS/MHz +- RV32I bypassed stages => 45 MHz, 2718 LC 0.65 DMIPS/MHz Its implementation can be found here: `src/main/scala/vexriscv/demo/Murax.scala`. 
@@ -415,17 +415,17 @@ You can find multiple software examples and demos here: https://github.com/Spina Here are some timing and area measurements of the Murax SoC: ``` -Murax interlocked stages (0.45 DMIPS/Mhz, 8 bits GPIO) -> - Artix 7 -> 216 Mhz 1109 LUT 1201 FF - Cyclone V -> 182 Mhz 725 ALMs - Cyclone IV -> 147 Mhz 1,551 LUT 1,223 FF - iCE40 -> 64 Mhz 2422 LC (nextpnr) +Murax interlocked stages (0.45 DMIPS/MHz, 8 bits GPIO) -> + Artix 7 -> 216 MHz 1109 LUT 1201 FF + Cyclone V -> 182 MHz 725 ALMs + Cyclone IV -> 147 MHz 1,551 LUT 1,223 FF + iCE40 -> 64 MHz 2422 LC (nextpnr) -MuraxFast bypassed stages (0.65 DMIPS/Mhz, 8 bits GPIO) -> - Artix 7 -> 224 Mhz 1278 LUT 1300 FF - Cyclone V -> 173 Mhz 867 ALMs - Cyclone IV -> 143 Mhz 1,755 LUT 1,258 FF - iCE40 -> 66 Mhz 2799 LC (nextpnr) +MuraxFast bypassed stages (0.65 DMIPS/MHz, 8 bits GPIO) -> + Artix 7 -> 224 MHz 1278 LUT 1300 FF + Cyclone V -> 173 MHz 867 ALMs + Cyclone IV -> 143 MHz 1,755 LUT 1,258 FF + iCE40 -> 66 MHz 2799 LC (nextpnr) ``` Some scripts to generate the SoC and call the icestorm toolchain can be found here: `scripts/Murax/` @@ -814,11 +814,11 @@ Synthesis results of the FPU itself, without the CPU integration, on the fast sp ``` Fpu 32 bits -> - Artix 7 relaxed -> 135 Mhz 1786 LUT 1778 FF - Artix 7 FMax -> 205 Mhz 2101 LUT 1778 FF + Artix 7 relaxed -> 135 MHz 1786 LUT 1778 FF + Artix 7 FMax -> 205 MHz 2101 LUT 1778 FF Fpu 64/32 bits -> - Artix 7 relaxed -> 101 Mhz 3336 LUT 3033 FF - Artix 7 FMax -> 165 Mhz 3728 LUT 3175 FF + Artix 7 relaxed -> 101 MHz 3336 LUT 3033 FF + Artix 7 FMax -> 165 MHz 3728 LUT 3175 FF ``` Note that if you want to debug FPU code via the openocd_riscv.vexriscv target, you need to use the GDB from : diff --git a/doc/gcdPeripheral/src/main/scala/vexriscv/demo/Murax.scala b/doc/gcdPeripheral/src/main/scala/vexriscv/demo/Murax.scala index f3d4f6c..486912a 100644 --- a/doc/gcdPeripheral/src/main/scala/vexriscv/demo/Murax.scala +++ 
b/doc/gcdPeripheral/src/main/scala/vexriscv/demo/Murax.scala @@ -27,8 +27,8 @@ import vexriscv.periph.tasks.hash._ /** Created by PIC32F_USER on 28/07/2017. * * Murax is a very light SoC which could work without any external component. - * - ICE40-hx8k + icestorm => 53 Mhz, 2142 LC - * - 0.37 DMIPS/Mhz + * - ICE40-hx8k + icestorm => 53 MHz, 2142 LC + * - 0.37 DMIPS/MHz * - 8 kB of on-chip ram * - JTAG debugger (eclipse/GDB/openocd ready) * - Interrupt support diff --git a/src/main/scala/vexriscv/demo/Murax.scala b/src/main/scala/vexriscv/demo/Murax.scala index 95a35b5..d7022f5 100644 --- a/src/main/scala/vexriscv/demo/Murax.scala +++ b/src/main/scala/vexriscv/demo/Murax.scala @@ -22,8 +22,8 @@ import scala.collection.Seq * Created by PIC32F_USER on 28/07/2017. * * Murax is a very light SoC which could work without any external component. - * - ICE40-hx8k + icestorm => 53 Mhz, 2142 LC - * - 0.37 DMIPS/Mhz + * - ICE40-hx8k + icestorm => 53 MHz, 2142 LC + * - 0.37 DMIPS/MHz * - 8 kB of on-chip ram * - JTAG debugger (eclipse/GDB/openocd ready) * - Interrupt support diff --git a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala index b78f84f..0f46a25 100644 --- a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala +++ b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala @@ -1758,17 +1758,17 @@ object FpuSynthesisBench extends App{ } // rotate2_24 -> -// Artix 7 -> 233 Mhz 96 LUT 167 FF -// Artix 7 -> 420 Mhz 86 LUT 229 FF +// Artix 7 -> 233 MHz 96 LUT 167 FF +// Artix 7 -> 420 MHz 86 LUT 229 FF // rotate2_32 -> -// Artix 7 -> 222 Mhz 108 LUT 238 FF -// Artix 7 -> 399 Mhz 110 LUT 300 FF +// Artix 7 -> 222 MHz 108 LUT 238 FF +// Artix 7 -> 399 MHz 110 LUT 300 FF // rotate2_52 -> -// Artix 7 -> 195 Mhz 230 LUT 362 FF -// Artix 7 -> 366 Mhz 225 LUT 486 FF +// Artix 7 -> 195 MHz 230 LUT 362 FF +// Artix 7 -> 366 MHz 225 LUT 486 FF // rotate2_64 -> -// Artix 7 -> 182 Mhz 257 LUT 465 FF -// Artix 7 -> 359 Mhz 266 LUT 591 FF +// Artix 7 -> 182 MHz 
257 LUT 465 FF +// Artix 7 -> 359 MHz 266 LUT 591 FF class Rotate2(width : Int) extends Rtl{ override def getName(): String = "rotate2_" + width override def getRtlPath(): String = getName() + ".v" @@ -1858,56 +1858,56 @@ object FpuSynthesisBench extends App{ } //Fpu_32 -> -//Artix 7 -> 136 Mhz 1471 LUT 1336 FF -//Artix 7 -> 196 Mhz 1687 LUT 1371 FF +//Artix 7 -> 136 MHz 1471 LUT 1336 FF +//Artix 7 -> 196 MHz 1687 LUT 1371 FF //Fpu_64 -> -//Artix 7 -> 105 Mhz 2822 LUT 2132 FF -//Artix 7 -> 161 Mhz 3114 LUT 2272 FF +//Artix 7 -> 105 MHz 2822 LUT 2132 FF +//Artix 7 -> 161 MHz 3114 LUT 2272 FF // // // //Fpu_32 -> -//Artix 7 -> 128 Mhz 1693 LUT 1481 FF -//Artix 7 -> 203 Mhz 1895 LUT 1481 FF +//Artix 7 -> 128 MHz 1693 LUT 1481 FF +//Artix 7 -> 203 MHz 1895 LUT 1481 FF //Fpu_64 -> -//Artix 7 -> 99 Mhz 3073 LUT 2396 FF -//Artix 7 -> 164 Mhz 3433 LUT 2432 FF +//Artix 7 -> 99 MHz 3073 LUT 2396 FF +//Artix 7 -> 164 MHz 3433 LUT 2432 FF //Fpu_32 -> -//Artix 7 -> 112 Mhz 1790 LUT 1666 FF -//Artix 7 -> 158 Mhz 1989 LUT 1701 FF +//Artix 7 -> 112 MHz 1790 LUT 1666 FF +//Artix 7 -> 158 MHz 1989 LUT 1701 FF //Fpu_64 -> -//Artix 7 -> 100 Mhz 3294 LUT 2763 FF -//Artix 7 -> 151 Mhz 3708 LUT 2904 FF +//Artix 7 -> 100 MHz 3294 LUT 2763 FF +//Artix 7 -> 151 MHz 3708 LUT 2904 FF //Fpu_32 -> -//Artix 7 -> 139 Mhz 1879 LUT 1713 FF -//Artix 7 -> 206 Mhz 2135 LUT 1723 FF +//Artix 7 -> 139 MHz 1879 LUT 1713 FF +//Artix 7 -> 206 MHz 2135 LUT 1723 FF //Fpu_64 -> -//Artix 7 -> 106 Mhz 3502 LUT 2811 FF -//Artix 7 -> 163 Mhz 3905 LUT 2951 FF +//Artix 7 -> 106 MHz 3502 LUT 2811 FF +//Artix 7 -> 163 MHz 3905 LUT 2951 FF //Fpu_32 -> -//Artix 7 -> 130 Mhz 1889 LUT 1835 FF -//Artix 7 -> 210 Mhz 2131 LUT 1845 FF +//Artix 7 -> 130 MHz 1889 LUT 1835 FF +//Artix 7 -> 210 MHz 2131 LUT 1845 FF //Fpu_64 -> -//Artix 7 -> 106 Mhz 3322 LUT 3023 FF -//Artix 7 -> 161 Mhz 3675 LUT 3163 FF +//Artix 7 -> 106 MHz 3322 LUT 3023 FF +//Artix 7 -> 161 MHz 3675 LUT 3163 FF //Fpu_32 -> -//Artix 7 -> 132 Mhz 1891 LUT 1837 FF 
-//Artix 7 -> 209 Mhz 2132 LUT 1847 FF +//Artix 7 -> 132 MHz 1891 LUT 1837 FF +//Artix 7 -> 209 MHz 2132 LUT 1847 FF //Fpu_64 -> -//Artix 7 -> 105 Mhz 3348 LUT 3024 FF -//Artix 7 -> 162 Mhz 3712 LUT 3165 FF +//Artix 7 -> 105 MHz 3348 LUT 3024 FF +//Artix 7 -> 162 MHz 3712 LUT 3165 FF //Fpu_32 -> -//Artix 7 -> 128 Mhz 1796 LUT 1727 FF -//Artix 7 -> 208 Mhz 2049 LUT 1727 FF +//Artix 7 -> 128 MHz 1796 LUT 1727 FF +//Artix 7 -> 208 MHz 2049 LUT 1727 FF //Fpu_64 -> -//Artix 7 -> 109 Mhz 3417 LUT 2913 FF -//Artix 7 -> 168 Mhz 3844 LUT 3053 FF +//Artix 7 -> 109 MHz 3417 LUT 2913 FF +//Artix 7 -> 168 MHz 3844 LUT 3053 FF /* testfloat -tininessafter -all1 > all1.txt diff --git a/src/main/scala/vexriscv/plugin/AesPlugin.scala b/src/main/scala/vexriscv/plugin/AesPlugin.scala index 0d4556a..eec48e5 100644 --- a/src/main/scala/vexriscv/plugin/AesPlugin.scala +++ b/src/main/scala/vexriscv/plugin/AesPlugin.scala @@ -53,7 +53,7 @@ import vexriscv.{DecoderService, Stageable, VexRiscv} * - SS specify which byte should be used from RS2 for the processing * * In practice the aes-256-cbc performances should improve by a factor 4. 
See the following results from libopenssl - * from a SoC running linux at 100 Mhz + * from a SoC running linux at 100 MHz * type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes * aes-256-cbc SW 492.58k 700.22k 796.41k 831.49k 830.09k 832.81k * aes-256 cbc HW 1781.52k 2834.07k 3323.07k 3486.72k 3465.22k 3440.10k diff --git a/src/test/scala/vexriscv/DhrystoneBench.scala b/src/test/scala/vexriscv/DhrystoneBench.scala index 48d1b67..3fabf42 100644 --- a/src/test/scala/vexriscv/DhrystoneBench.scala +++ b/src/test/scala/vexriscv/DhrystoneBench.scala @@ -45,7 +45,7 @@ class DhrystoneBench extends AnyFunSuite { val coremarkIterations = intFind.findFirstIn("Iterations \\: (\\d+.?)+".r.findAllIn(str).toList.last).get.toDouble val coremarkHzs = intFind.findFirstIn("DCLOCKS_PER_SEC=(\\d+.?)+".r.findAllIn(str).toList.last).get.toDouble val coremarkPerMhz = 1e6 * coremarkIterations / coremarkTicks - report ++= s"$name -> $dmips DMIPS/Mhz $coremarkPerMhz Coremark/Mhz\n" + report ++= s"$name -> $dmips DMIPS/MHz $coremarkPerMhz Coremark/MHz\n" } }