From 4bdab667cccc5b4be33cb1666ecb1a1f5a0e5dad Mon Sep 17 00:00:00 2001 From: Dolu1990 Date: Tue, 2 Mar 2021 19:39:55 +0100 Subject: [PATCH] fpu fix cmd / commit race condition --- src/main/scala/vexriscv/TestsWorkspace.scala | 12 ++++--- .../demo/smp/VexRiscvSmpCluster.scala | 4 ++- src/main/scala/vexriscv/ip/fpu/FpuCore.scala | 31 +++++-------------- .../scala/vexriscv/plugin/FpuPlugin.scala | 19 +++++++++--- src/test/cpp/regression/main.cpp | 9 ++++-- 5 files changed, 40 insertions(+), 35 deletions(-) diff --git a/src/main/scala/vexriscv/TestsWorkspace.scala b/src/main/scala/vexriscv/TestsWorkspace.scala index f4347f4..c961f05 100644 --- a/src/main/scala/vexriscv/TestsWorkspace.scala +++ b/src/main/scala/vexriscv/TestsWorkspace.scala @@ -121,11 +121,12 @@ object TestsWorkspace { // cd buildroot-build/ // make O=$PWD BR2_EXTERNAL=../buildroot-spinal-saxon -C ../buildroot saxon_regression_defconfig + //make clean all IBUS=CACHED IBUS_DATA_WIDTH=64 COMPRESSED=no DBUS=CACHED DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 LRSC=yes AMO=yes SUPERVISOR=yes DBUS_EXCLUSIVE=yes DBUS_INVALIDATE=yes MUL=yes DIV=yes RVF=yes RVD=yes DEBUG_PLUGIN=no TRACE=yes REDO=1 DEBUG=ye WITH_USER_IO=no FLOW_INFO=no TRACE_START=565000000000ll SEED=45 + + //make clean all IBUS=CACHED IBUS_DATA_WIDTH=64 COMPRESSED=no DBUS=CACHED DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 LRSC=yes AMO=yes SUPERVISOR=yes DBUS_EXCLUSIVE=yes DBUS_INVALIDATE=yes MUL=yes DIV=yes RVF=yes RVD=yes DEBUG_PLUGIN=no TRACE=yes REDO=100 DEBUG=ye WITH_USER_IO=no FLOW_INFO=no TRACE_START=5600000000000ll SEED=45 STOP_ON_ERROR=ye + // export IMAGES=/media/data/open/SaxonSoc/artyA7SmpUpdate/buildroot-regression/buildroot-build/images - // make clean all IBUS=CACHED IBUS_DATA_WIDTH=64 COMPRESSED=no DBUS=CACHED DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 LRSC=yes AMO=yes SUPERVISOR=yes DBUS_EXCLUSIVE=yes DBUS_INVALIDATE=yes MUL=yes DIV=yes RVF=yes RVD=yes DEBUG_PLUGIN=no LINUX_SOC_SMP=yes EMULATOR=$IMAGES/fw_jump.bin VMLINUX=$IMAGES/Image DTB=$IMAGES/linux.dtb RAMDISK=$IMAGES/rootfs.cpio TRACE=ye REDO=1 DEBUG=ye WITH_USER_IO=no - // make clean all IBUS=CACHED IBUS_DATA_WIDTH=64 COMPRESSED=no DBUS=CACHED DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 LRSC=yes AMO=yes SUPERVISOR=yes DBUS_EXCLUSIVE=yes DBUS_INVALIDATE=yes MUL=yes DIV=yes RVF=yes RVD=yes DEBUG_PLUGIN=no LINUX_SOC_SMP=yes EMULATOR=$IMAGES/fw_jump.bin VMLINUX=$IMAGES/Image DTB=$IMAGES/linux.dtb RAMDISK=$IMAGES/rootfs.cpio TRACE=yes REDO=1 DEBUG=ye WITH_USER_IO=no FLOW_INFO=yes TRACE_START=47000000000ll SEED=43 - // make clean all IBUS=CACHED IBUS_DATA_WIDTH=64 COMPRESSED=no DBUS=CACHED DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 LRSC=yes AMO=yes SUPERVISOR=yes DBUS_EXCLUSIVE=yes DBUS_INVALIDATE=yes MUL=yes DIV=yes RVF=yes RVD=yes DEBUG_PLUGIN=no LINUX_SOC_SMP=yes EMULATOR=$IMAGES/fw_jump.bin VMLINUX=$IMAGES/Image DTB=$IMAGES/linux.dtb RAMDISK=$IMAGES/rootfs.cpio TRACE=yes REDO=1 DEBUG=ye WITH_USER_IO=no FLOW_INFO=yes TRACE_START=47000000000ll SEED=45 - //make clean all IBUS=CACHED IBUS_DATA_WIDTH=64 COMPRESSED=no DBUS=CACHED DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 LRSC=yes AMO=yes SUPERVISOR=yes DBUS_EXCLUSIVE=yes DBUS_INVALIDATE=yes MUL=yes DIV=yes RVF=yes RVD=yes DEBUG_PLUGIN=no LINUX_SOC_SMP=yes EMULATOR=$IMAGES/fw_jump.bin VMLINUX=$IMAGES/Image DTB=$IMAGES/linux.dtb RAMDISK=$IMAGES/rootfs.cpio TRACE=yes REDO=1 DEBUG=ye WITH_USER_IO=no FLOW_INFO=yes TRACE_START=565000000ll SEED=45 + // make clean all IBUS=CACHED IBUS_DATA_WIDTH=64 COMPRESSED=no DBUS=CACHED DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 LRSC=yes AMO=yes SUPERVISOR=yes DBUS_EXCLUSIVE=yes DBUS_INVALIDATE=yes MUL=yes DIV=yes RVF=yes RVD=yes DEBUG_PLUGIN=no LINUX_SOC_SMP=yes EMULATOR=$IMAGES/fw_jump.bin VMLINUX=$IMAGES/Image DTB=$IMAGES/linux.dtb RAMDISK=$IMAGES/rootfs.cpio TRACE=yes REDO=1 DEBUG=ye WITH_USER_IO=no FLOW_INFO=no TRACE_START=565000000000ll SEED=45 val config = VexRiscvSmpClusterGen.vexRiscvConfig( hartId = 0, ioRange = _ (31 downto 28) === 0xF, @@ -139,7 +140,8 @@ object TestsWorkspace { dCacheWays = 2, withFloat = true, withDouble = true, - externalFpu = false + externalFpu = false, + simHalt = true ) diff --git a/src/main/scala/vexriscv/demo/smp/VexRiscvSmpCluster.scala b/src/main/scala/vexriscv/demo/smp/VexRiscvSmpCluster.scala index 0e3018b..177187e 100644 --- a/src/main/scala/vexriscv/demo/smp/VexRiscvSmpCluster.scala +++ b/src/main/scala/vexriscv/demo/smp/VexRiscvSmpCluster.scala @@ -167,7 +167,8 @@ object VexRiscvSmpClusterGen { withSupervisor : Boolean = true, withFloat : Boolean = false, withDouble : Boolean = false, - externalFpu : Boolean = true + externalFpu : Boolean = true, + simHalt : Boolean = false ) = { assert(iCacheSize/iCacheWays <= 4096, "Instruction cache ways can't be bigger than 4096 bytes") assert(dCacheSize/dCacheWays <= 4096, "Data cache ways can't be bigger than 4096 bytes") @@ -280,6 +281,7 @@ object VexRiscvSmpClusterGen { if(withFloat) config.plugins += new FpuPlugin( externalFpu = externalFpu, + simHalt = simHalt, p = FpuParameter(withDouble = withDouble) ) config diff --git a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala index 2dba93a..5c81421 100644 --- a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala +++ b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala @@ -179,25 +179,10 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ }) } -// val completion = for(source <- 0 until portCount) yield new Area{ -// def port = io.port(source) -// port.completion.flag.NV := False -// port.completion.flag.DZ := False -// port.completion.flag.OF := False -// port.completion.flag.UF := False -// port.completion.flag.NX := False -// -// val increments = ArrayBuffer[Bool]() -// -// afterElaboration{ -// port.completion.count := increments.map(_.asUInt.resize(log2Up(increments.size + 1))).reduceBalancedTree(_ + _) -// } -// } - val commitFork = new Area{ val load, commit = Vec(Stream(FpuCommit(p)), portCount) for(i <- 0 until portCount){ - val fork = new StreamFork(FpuCommit(p), 2) + val fork = new StreamFork(FpuCommit(p), 2, synchronous = true) fork.io.input << io.port(i).commit fork.io.outputs(0) >> load(i) fork.io.outputs(1).pipelined(m2s = true, s2m = true) >> commit(i) //Pipelining here is light, as it only use the flags of the payload @@ -214,8 +199,9 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ } class CommitArea(source : Int) extends Area{ + val pending = new Tracker(4) val add, mul, div, sqrt, short = new Tracker(4) - val input = commitFork.commit(source).haltWhen(List(add, mul, div, sqrt, short).map(_.full).orR).toFlow + val input = commitFork.commit(source).haltWhen(List(add, mul, div, sqrt, short).map(_.full).orR || !pending.notEmpty).toFlow when(input.fire){ add.inc setWhen(List(FpuOpcode.ADD).map(input.opcode === _).orR) @@ -224,6 +210,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ sqrt.inc setWhen(List(FpuOpcode.SQRT).map(input.opcode === _).orR) short.inc setWhen(List(FpuOpcode.SGNJ, FpuOpcode.MIN_MAX, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR) rf.scoreboards(source).writes(input.rd) := input.write + pending.dec := True } } @@ -237,7 +224,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ val scheduler = for(portId <- 0 until portCount; scoreboard = rf.scoreboards(portId)) yield new Area{ - val input = io.port(portId).cmd.combStage() + val input = io.port(portId).cmd.pipelined(s2m = true) val useRs1, useRs2, useRs3, useRd = False switch(input.opcode){ is(p.Opcode.LOAD) { useRd := True } @@ -265,7 +252,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ val rfBusy = (rfHits, rfTargets).zipped.map(_ ^ _) val hits = (0 to 3).map(id => uses(id) && rfBusy(id)) - val hazard = hits.orR || !rf.init.done + val hazard = hits.orR || !rf.init.done || commitLogic(portId).pending.full val output = input.haltWhen(hazard) when(input.valid && rf.init.done){ scoreboard.targetWrite.address := input.rd @@ -273,6 +260,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ } when(output.fire && useRd){ scoreboard.targetWrite.valid := True + commitLogic(portId).pending.inc := True } } @@ -287,7 +275,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ } val read = new Area{ - val s0 = cmdArbiter.output.pipelined(m2s = true, s2m = true) //TODO may need to remove m2s for store latency + val s0 = cmdArbiter.output.pipelined() //TODO may need to remove m2s for store latency val s1 = s0.m2sPipe() val output = s1.swapPayload(RfReadOutput()) val rs1Entry = rf.ram.readSync(s0.source @@ s0.rs1,enable = !output.isStall) @@ -982,7 +970,6 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ when(exp(exp.getWidth-3, 3 bits) >= 5) { output.exponent(p.internalExponentSize-2, 2 bits) := 3 } -// val flag = io.port(input.source).completion.flag when(forceNan) { output.setNanQuiet NV setWhen(infinitynan || input.rs1.isNanSignaling || input.rs2.isNanSignaling) @@ -1479,8 +1466,6 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ if (p.withDouble) output.format := input.format output.scrap := (mantissa(1) | mantissa(0) | roundingScrap) - -// val flag = io.port(input.source).completion.flag output.NV := infinityNan || input.rs1.isNanSignaling || input.rs2.isNanSignaling output.DZ := False when(forceNan) { diff --git a/src/main/scala/vexriscv/plugin/FpuPlugin.scala b/src/main/scala/vexriscv/plugin/FpuPlugin.scala index a49d4c9..6aa6831 100644 --- a/src/main/scala/vexriscv/plugin/FpuPlugin.scala +++ b/src/main/scala/vexriscv/plugin/FpuPlugin.scala @@ -10,6 +10,7 @@ import vexriscv.ip.fpu._ import scala.collection.mutable.ArrayBuffer class FpuPlugin(externalFpu : Boolean = false, + simHalt : Boolean = false, p : FpuParameter) extends Plugin[VexRiscv] with VexRiscvRegressionArg { object FPU_ENABLE extends Stageable(Bool()) @@ -222,10 +223,20 @@ class FpuPlugin(externalFpu : Boolean = false, val internal = (!externalFpu).generate (pipeline plug new Area{ val fpu = FpuCore(1, p) - fpu.io.port(0).cmd << port.cmd - fpu.io.port(0).commit << port.commit - fpu.io.port(0).rsp >> port.rsp - fpu.io.port(0).completion <> port.completion + if(simHalt) { + val cmdHalt = in(Bool).setName("fpuCmdHalt").addAttribute(Verilator.public) + val commitHalt = in(Bool).setName("fpuCommitHalt").addAttribute(Verilator.public) + val rspHalt = in(Bool).setName("fpuRspHalt").addAttribute(Verilator.public) + fpu.io.port(0).cmd << port.cmd.haltWhen(cmdHalt) + fpu.io.port(0).commit << port.commit.haltWhen(commitHalt) + fpu.io.port(0).rsp.haltWhen(rspHalt) >> port.rsp + fpu.io.port(0).completion <> port.completion + } else { + fpu.io.port(0).cmd << port.cmd + fpu.io.port(0).commit << port.commit + fpu.io.port(0).rsp >> port.rsp + fpu.io.port(0).completion <> port.completion + } }) diff --git a/src/test/cpp/regression/main.cpp b/src/test/cpp/regression/main.cpp index ac9417b..e58aba3 100644 --- a/src/test/cpp/regression/main.cpp +++ b/src/test/cpp/regression/main.cpp @@ -1848,6 +1848,11 @@ public: instanceCycles += 1; for(SimElement* simElement : simElements) simElement->postCycle(); + #ifdef RVF + top->fpuCmdHalt = VL_RANDOM_I(1); + top->fpuCommitHalt = VL_RANDOM_I(1); + top->fpuRspHalt = VL_RANDOM_I(1); + #endif @@ -3815,10 +3820,10 @@ string riscvTestMemory[] = { string riscvTestFloat[] = { + "rv32uf-p-fmadd", "rv32uf-p-fadd", "rv32uf-p-fcmp", "rv32uf-p-fcvt_w", - "rv32uf-p-fmadd", "rv32uf-p-ldst", "rv32uf-p-recoding", "rv32uf-p-fclass", @@ -3830,9 +3835,9 @@ string riscvTestFloat[] = { string riscvTestDouble[] = { + "rv32ud-p-fmadd", "rv32ud-p-fadd", "rv32ud-p-fcvt", - "rv32ud-p-fmadd", "rv32ud-p-recoding", "rv32ud-p-fclass", "rv32ud-p-fcvt_w",