fpu fix cmd / commit race condition

This commit is contained in:
Dolu1990 2021-03-02 19:39:55 +01:00
parent 636d53cf63
commit 4bdab667cc
5 changed files with 40 additions and 35 deletions

View File

@ -121,11 +121,12 @@ object TestsWorkspace {
// cd buildroot-build/
// make O=$PWD BR2_EXTERNAL=../buildroot-spinal-saxon -C ../buildroot saxon_regression_defconfig
//make clean all IBUS=CACHED IBUS_DATA_WIDTH=64 COMPRESSED=no DBUS=CACHED DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 LRSC=yes AMO=yes SUPERVISOR=yes DBUS_EXCLUSIVE=yes DBUS_INVALIDATE=yes MUL=yes DIV=yes RVF=yes RVD=yes DEBUG_PLUGIN=no TRACE=yes REDO=1 DEBUG=ye WITH_USER_IO=no FLOW_INFO=no TRACE_START=565000000000ll SEED=45
//make clean all IBUS=CACHED IBUS_DATA_WIDTH=64 COMPRESSED=no DBUS=CACHED DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 LRSC=yes AMO=yes SUPERVISOR=yes DBUS_EXCLUSIVE=yes DBUS_INVALIDATE=yes MUL=yes DIV=yes RVF=yes RVD=yes DEBUG_PLUGIN=no TRACE=yes REDO=100 DEBUG=ye WITH_USER_IO=no FLOW_INFO=no TRACE_START=5600000000000ll SEED=45 STOP_ON_ERROR=ye
// export IMAGES=/media/data/open/SaxonSoc/artyA7SmpUpdate/buildroot-regression/buildroot-build/images
// make clean all IBUS=CACHED IBUS_DATA_WIDTH=64 COMPRESSED=no DBUS=CACHED DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 LRSC=yes AMO=yes SUPERVISOR=yes DBUS_EXCLUSIVE=yes DBUS_INVALIDATE=yes MUL=yes DIV=yes RVF=yes RVD=yes DEBUG_PLUGIN=no LINUX_SOC_SMP=yes EMULATOR=$IMAGES/fw_jump.bin VMLINUX=$IMAGES/Image DTB=$IMAGES/linux.dtb RAMDISK=$IMAGES/rootfs.cpio TRACE=ye REDO=1 DEBUG=ye WITH_USER_IO=no
// make clean all IBUS=CACHED IBUS_DATA_WIDTH=64 COMPRESSED=no DBUS=CACHED DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 LRSC=yes AMO=yes SUPERVISOR=yes DBUS_EXCLUSIVE=yes DBUS_INVALIDATE=yes MUL=yes DIV=yes RVF=yes RVD=yes DEBUG_PLUGIN=no LINUX_SOC_SMP=yes EMULATOR=$IMAGES/fw_jump.bin VMLINUX=$IMAGES/Image DTB=$IMAGES/linux.dtb RAMDISK=$IMAGES/rootfs.cpio TRACE=yes REDO=1 DEBUG=ye WITH_USER_IO=no FLOW_INFO=yes TRACE_START=47000000000ll SEED=43
// make clean all IBUS=CACHED IBUS_DATA_WIDTH=64 COMPRESSED=no DBUS=CACHED DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 LRSC=yes AMO=yes SUPERVISOR=yes DBUS_EXCLUSIVE=yes DBUS_INVALIDATE=yes MUL=yes DIV=yes RVF=yes RVD=yes DEBUG_PLUGIN=no LINUX_SOC_SMP=yes EMULATOR=$IMAGES/fw_jump.bin VMLINUX=$IMAGES/Image DTB=$IMAGES/linux.dtb RAMDISK=$IMAGES/rootfs.cpio TRACE=yes REDO=1 DEBUG=ye WITH_USER_IO=no FLOW_INFO=yes TRACE_START=47000000000ll SEED=45
//make clean all IBUS=CACHED IBUS_DATA_WIDTH=64 COMPRESSED=no DBUS=CACHED DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 LRSC=yes AMO=yes SUPERVISOR=yes DBUS_EXCLUSIVE=yes DBUS_INVALIDATE=yes MUL=yes DIV=yes RVF=yes RVD=yes DEBUG_PLUGIN=no LINUX_SOC_SMP=yes EMULATOR=$IMAGES/fw_jump.bin VMLINUX=$IMAGES/Image DTB=$IMAGES/linux.dtb RAMDISK=$IMAGES/rootfs.cpio TRACE=yes REDO=1 DEBUG=ye WITH_USER_IO=no FLOW_INFO=yes TRACE_START=565000000ll SEED=45
// make clean all IBUS=CACHED IBUS_DATA_WIDTH=64 COMPRESSED=no DBUS=CACHED DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 LRSC=yes AMO=yes SUPERVISOR=yes DBUS_EXCLUSIVE=yes DBUS_INVALIDATE=yes MUL=yes DIV=yes RVF=yes RVD=yes DEBUG_PLUGIN=no LINUX_SOC_SMP=yes EMULATOR=$IMAGES/fw_jump.bin VMLINUX=$IMAGES/Image DTB=$IMAGES/linux.dtb RAMDISK=$IMAGES/rootfs.cpio TRACE=yes REDO=1 DEBUG=ye WITH_USER_IO=no FLOW_INFO=no TRACE_START=565000000000ll SEED=45
val config = VexRiscvSmpClusterGen.vexRiscvConfig(
hartId = 0,
ioRange = _ (31 downto 28) === 0xF,
@ -139,7 +140,8 @@ object TestsWorkspace {
dCacheWays = 2,
withFloat = true,
withDouble = true,
externalFpu = false
externalFpu = false,
simHalt = true
)

View File

@ -167,7 +167,8 @@ object VexRiscvSmpClusterGen {
withSupervisor : Boolean = true,
withFloat : Boolean = false,
withDouble : Boolean = false,
externalFpu : Boolean = true
externalFpu : Boolean = true,
simHalt : Boolean = false
) = {
assert(iCacheSize/iCacheWays <= 4096, "Instruction cache ways can't be bigger than 4096 bytes")
assert(dCacheSize/dCacheWays <= 4096, "Data cache ways can't be bigger than 4096 bytes")
@ -280,6 +281,7 @@ object VexRiscvSmpClusterGen {
// Register the FPU plugin only when floating-point support is requested.
// externalFpu and simHalt are forwarded unchanged from this generator's
// parameters; withDouble selects double-precision support via FpuParameter.
if(withFloat) config.plugins += new FpuPlugin(
externalFpu = externalFpu,
simHalt = simHalt,
p = FpuParameter(withDouble = withDouble)
)
config

View File

@ -179,25 +179,10 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
})
}
// val completion = for(source <- 0 until portCount) yield new Area{
// def port = io.port(source)
// port.completion.flag.NV := False
// port.completion.flag.DZ := False
// port.completion.flag.OF := False
// port.completion.flag.UF := False
// port.completion.flag.NX := False
//
// val increments = ArrayBuffer[Bool]()
//
// afterElaboration{
// port.completion.count := increments.map(_.asUInt.resize(log2Up(increments.size + 1))).reduceBalancedTree(_ + _)
// }
// }
val commitFork = new Area{
val load, commit = Vec(Stream(FpuCommit(p)), portCount)
for(i <- 0 until portCount){
val fork = new StreamFork(FpuCommit(p), 2)
val fork = new StreamFork(FpuCommit(p), 2, synchronous = true)
fork.io.input << io.port(i).commit
fork.io.outputs(0) >> load(i)
fork.io.outputs(1).pipelined(m2s = true, s2m = true) >> commit(i) //Pipelining here is light, as it only uses the flags of the payload
@ -214,8 +199,9 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
}
class CommitArea(source : Int) extends Area{
val pending = new Tracker(4)
val add, mul, div, sqrt, short = new Tracker(4)
val input = commitFork.commit(source).haltWhen(List(add, mul, div, sqrt, short).map(_.full).orR).toFlow
val input = commitFork.commit(source).haltWhen(List(add, mul, div, sqrt, short).map(_.full).orR || !pending.notEmpty).toFlow
when(input.fire){
add.inc setWhen(List(FpuOpcode.ADD).map(input.opcode === _).orR)
@ -224,6 +210,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
sqrt.inc setWhen(List(FpuOpcode.SQRT).map(input.opcode === _).orR)
short.inc setWhen(List(FpuOpcode.SGNJ, FpuOpcode.MIN_MAX, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR)
rf.scoreboards(source).writes(input.rd) := input.write
pending.dec := True
}
}
@ -237,7 +224,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
val scheduler = for(portId <- 0 until portCount;
scoreboard = rf.scoreboards(portId)) yield new Area{
val input = io.port(portId).cmd.combStage()
val input = io.port(portId).cmd.pipelined(s2m = true)
val useRs1, useRs2, useRs3, useRd = False
switch(input.opcode){
is(p.Opcode.LOAD) { useRd := True }
@ -265,7 +252,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
val rfBusy = (rfHits, rfTargets).zipped.map(_ ^ _)
val hits = (0 to 3).map(id => uses(id) && rfBusy(id))
val hazard = hits.orR || !rf.init.done
val hazard = hits.orR || !rf.init.done || commitLogic(portId).pending.full
val output = input.haltWhen(hazard)
when(input.valid && rf.init.done){
scoreboard.targetWrite.address := input.rd
@ -273,6 +260,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
}
when(output.fire && useRd){
scoreboard.targetWrite.valid := True
commitLogic(portId).pending.inc := True
}
}
@ -287,7 +275,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
}
val read = new Area{
val s0 = cmdArbiter.output.pipelined(m2s = true, s2m = true) //TODO may need to remove m2s for store latency
val s0 = cmdArbiter.output.pipelined() //TODO may need to remove m2s for store latency
val s1 = s0.m2sPipe()
val output = s1.swapPayload(RfReadOutput())
val rs1Entry = rf.ram.readSync(s0.source @@ s0.rs1,enable = !output.isStall)
@ -982,7 +970,6 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
when(exp(exp.getWidth-3, 3 bits) >= 5) { output.exponent(p.internalExponentSize-2, 2 bits) := 3 }
// val flag = io.port(input.source).completion.flag
when(forceNan) {
output.setNanQuiet
NV setWhen(infinitynan || input.rs1.isNanSignaling || input.rs2.isNanSignaling)
@ -1479,8 +1466,6 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
if (p.withDouble) output.format := input.format
output.scrap := (mantissa(1) | mantissa(0) | roundingScrap)
// val flag = io.port(input.source).completion.flag
output.NV := infinityNan || input.rs1.isNanSignaling || input.rs2.isNanSignaling
output.DZ := False
when(forceNan) {

View File

@ -10,6 +10,7 @@ import vexriscv.ip.fpu._
import scala.collection.mutable.ArrayBuffer
class FpuPlugin(externalFpu : Boolean = false,
simHalt : Boolean = false,
p : FpuParameter) extends Plugin[VexRiscv] with VexRiscvRegressionArg {
object FPU_ENABLE extends Stageable(Bool())
@ -222,10 +223,20 @@ class FpuPlugin(externalFpu : Boolean = false,
// Internal FPU: generated only when externalFpu is false. Instantiates a
// single-port FpuCore and wires its port 0 to this plugin's `port`.
val internal = (!externalFpu).generate (pipeline plug new Area{
val fpu = FpuCore(1, p)
if(simHalt) {
// Simulation-only stall hooks: three input Bools with fixed names, tagged
// Verilator.public. Each one independently halts one of the cmd / commit /
// rsp streams while it is asserted.
// NOTE(review): presumably driven by the simulation testbench to inject
// random back-pressure - confirm against the test harness.
val cmdHalt = in(Bool).setName("fpuCmdHalt").addAttribute(Verilator.public)
val commitHalt = in(Bool).setName("fpuCommitHalt").addAttribute(Verilator.public)
val rspHalt = in(Bool).setName("fpuRspHalt").addAttribute(Verilator.public)
fpu.io.port(0).cmd << port.cmd.haltWhen(cmdHalt)
fpu.io.port(0).commit << port.commit.haltWhen(commitHalt)
fpu.io.port(0).rsp.haltWhen(rspHalt) >> port.rsp
// completion is connected directly; no halt signal is applied to it.
fpu.io.port(0).completion <> port.completion
} else {
// Normal build: connect all four channels directly, with no stall hooks.
fpu.io.port(0).cmd << port.cmd
fpu.io.port(0).commit << port.commit
fpu.io.port(0).rsp >> port.rsp
fpu.io.port(0).completion <> port.completion
}
})

View File

@ -1848,6 +1848,11 @@ public:
instanceCycles += 1;
for(SimElement* simElement : simElements) simElement->postCycle();
#ifdef RVF
top->fpuCmdHalt = VL_RANDOM_I(1);
top->fpuCommitHalt = VL_RANDOM_I(1);
top->fpuRspHalt = VL_RANDOM_I(1);
#endif
@ -3815,10 +3820,10 @@ string riscvTestMemory[] = {
string riscvTestFloat[] = {
"rv32uf-p-fmadd",
"rv32uf-p-fadd",
"rv32uf-p-fcmp",
"rv32uf-p-fcvt_w",
"rv32uf-p-fmadd",
"rv32uf-p-ldst",
"rv32uf-p-recoding",
"rv32uf-p-fclass",
@ -3830,9 +3835,9 @@ string riscvTestFloat[] = {
string riscvTestDouble[] = {
"rv32ud-p-fmadd",
"rv32ud-p-fadd",
"rv32ud-p-fcvt",
"rv32ud-p-fmadd",
"rv32ud-p-recoding",
"rv32ud-p-fclass",
"rv32ud-p-fcvt_w",