fpu fix cmd / commit race condition

This commit is contained in:
Dolu1990 2021-03-02 19:39:55 +01:00
parent 636d53cf63
commit 4bdab667cc
5 changed files with 40 additions and 35 deletions

View File

@ -121,11 +121,12 @@ object TestsWorkspace {
// cd buildroot-build/ // cd buildroot-build/
// make O=$PWD BR2_EXTERNAL=../buildroot-spinal-saxon -C ../buildroot saxon_regression_defconfig // make O=$PWD BR2_EXTERNAL=../buildroot-spinal-saxon -C ../buildroot saxon_regression_defconfig
//make clean all IBUS=CACHED IBUS_DATA_WIDTH=64 COMPRESSED=no DBUS=CACHED DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 LRSC=yes AMO=yes SUPERVISOR=yes DBUS_EXCLUSIVE=yes DBUS_INVALIDATE=yes MUL=yes DIV=yes RVF=yes RVD=yes DEBUG_PLUGIN=no TRACE=yes REDO=1 DEBUG=ye WITH_USER_IO=no FLOW_INFO=no TRACE_START=565000000000ll SEED=45
//make clean all IBUS=CACHED IBUS_DATA_WIDTH=64 COMPRESSED=no DBUS=CACHED DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 LRSC=yes AMO=yes SUPERVISOR=yes DBUS_EXCLUSIVE=yes DBUS_INVALIDATE=yes MUL=yes DIV=yes RVF=yes RVD=yes DEBUG_PLUGIN=no TRACE=yes REDO=100 DEBUG=ye WITH_USER_IO=no FLOW_INFO=no TRACE_START=5600000000000ll SEED=45 STOP_ON_ERROR=ye
// export IMAGES=/media/data/open/SaxonSoc/artyA7SmpUpdate/buildroot-regression/buildroot-build/images // export IMAGES=/media/data/open/SaxonSoc/artyA7SmpUpdate/buildroot-regression/buildroot-build/images
// make clean all IBUS=CACHED IBUS_DATA_WIDTH=64 COMPRESSED=no DBUS=CACHED DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 LRSC=yes AMO=yes SUPERVISOR=yes DBUS_EXCLUSIVE=yes DBUS_INVALIDATE=yes MUL=yes DIV=yes RVF=yes RVD=yes DEBUG_PLUGIN=no LINUX_SOC_SMP=yes EMULATOR=$IMAGES/fw_jump.bin VMLINUX=$IMAGES/Image DTB=$IMAGES/linux.dtb RAMDISK=$IMAGES/rootfs.cpio TRACE=ye REDO=1 DEBUG=ye WITH_USER_IO=no // make clean all IBUS=CACHED IBUS_DATA_WIDTH=64 COMPRESSED=no DBUS=CACHED DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 LRSC=yes AMO=yes SUPERVISOR=yes DBUS_EXCLUSIVE=yes DBUS_INVALIDATE=yes MUL=yes DIV=yes RVF=yes RVD=yes DEBUG_PLUGIN=no LINUX_SOC_SMP=yes EMULATOR=$IMAGES/fw_jump.bin VMLINUX=$IMAGES/Image DTB=$IMAGES/linux.dtb RAMDISK=$IMAGES/rootfs.cpio TRACE=yes REDO=1 DEBUG=ye WITH_USER_IO=no FLOW_INFO=no TRACE_START=565000000000ll SEED=45
// make clean all IBUS=CACHED IBUS_DATA_WIDTH=64 COMPRESSED=no DBUS=CACHED DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 LRSC=yes AMO=yes SUPERVISOR=yes DBUS_EXCLUSIVE=yes DBUS_INVALIDATE=yes MUL=yes DIV=yes RVF=yes RVD=yes DEBUG_PLUGIN=no LINUX_SOC_SMP=yes EMULATOR=$IMAGES/fw_jump.bin VMLINUX=$IMAGES/Image DTB=$IMAGES/linux.dtb RAMDISK=$IMAGES/rootfs.cpio TRACE=yes REDO=1 DEBUG=ye WITH_USER_IO=no FLOW_INFO=yes TRACE_START=47000000000ll SEED=43
// make clean all IBUS=CACHED IBUS_DATA_WIDTH=64 COMPRESSED=no DBUS=CACHED DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 LRSC=yes AMO=yes SUPERVISOR=yes DBUS_EXCLUSIVE=yes DBUS_INVALIDATE=yes MUL=yes DIV=yes RVF=yes RVD=yes DEBUG_PLUGIN=no LINUX_SOC_SMP=yes EMULATOR=$IMAGES/fw_jump.bin VMLINUX=$IMAGES/Image DTB=$IMAGES/linux.dtb RAMDISK=$IMAGES/rootfs.cpio TRACE=yes REDO=1 DEBUG=ye WITH_USER_IO=no FLOW_INFO=yes TRACE_START=47000000000ll SEED=45
//make clean all IBUS=CACHED IBUS_DATA_WIDTH=64 COMPRESSED=no DBUS=CACHED DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 LRSC=yes AMO=yes SUPERVISOR=yes DBUS_EXCLUSIVE=yes DBUS_INVALIDATE=yes MUL=yes DIV=yes RVF=yes RVD=yes DEBUG_PLUGIN=no LINUX_SOC_SMP=yes EMULATOR=$IMAGES/fw_jump.bin VMLINUX=$IMAGES/Image DTB=$IMAGES/linux.dtb RAMDISK=$IMAGES/rootfs.cpio TRACE=yes REDO=1 DEBUG=ye WITH_USER_IO=no FLOW_INFO=yes TRACE_START=565000000ll SEED=45
val config = VexRiscvSmpClusterGen.vexRiscvConfig( val config = VexRiscvSmpClusterGen.vexRiscvConfig(
hartId = 0, hartId = 0,
ioRange = _ (31 downto 28) === 0xF, ioRange = _ (31 downto 28) === 0xF,
@ -139,7 +140,8 @@ object TestsWorkspace {
dCacheWays = 2, dCacheWays = 2,
withFloat = true, withFloat = true,
withDouble = true, withDouble = true,
externalFpu = false externalFpu = false,
simHalt = true
) )

View File

@ -167,7 +167,8 @@ object VexRiscvSmpClusterGen {
withSupervisor : Boolean = true, withSupervisor : Boolean = true,
withFloat : Boolean = false, withFloat : Boolean = false,
withDouble : Boolean = false, withDouble : Boolean = false,
externalFpu : Boolean = true externalFpu : Boolean = true,
simHalt : Boolean = false
) = { ) = {
assert(iCacheSize/iCacheWays <= 4096, "Instruction cache ways can't be bigger than 4096 bytes") assert(iCacheSize/iCacheWays <= 4096, "Instruction cache ways can't be bigger than 4096 bytes")
assert(dCacheSize/dCacheWays <= 4096, "Data cache ways can't be bigger than 4096 bytes") assert(dCacheSize/dCacheWays <= 4096, "Data cache ways can't be bigger than 4096 bytes")
@ -280,6 +281,7 @@ object VexRiscvSmpClusterGen {
if(withFloat) config.plugins += new FpuPlugin( if(withFloat) config.plugins += new FpuPlugin(
externalFpu = externalFpu, externalFpu = externalFpu,
simHalt = simHalt,
p = FpuParameter(withDouble = withDouble) p = FpuParameter(withDouble = withDouble)
) )
config config

View File

@ -179,25 +179,10 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
}) })
} }
// val completion = for(source <- 0 until portCount) yield new Area{
// def port = io.port(source)
// port.completion.flag.NV := False
// port.completion.flag.DZ := False
// port.completion.flag.OF := False
// port.completion.flag.UF := False
// port.completion.flag.NX := False
//
// val increments = ArrayBuffer[Bool]()
//
// afterElaboration{
// port.completion.count := increments.map(_.asUInt.resize(log2Up(increments.size + 1))).reduceBalancedTree(_ + _)
// }
// }
val commitFork = new Area{ val commitFork = new Area{
val load, commit = Vec(Stream(FpuCommit(p)), portCount) val load, commit = Vec(Stream(FpuCommit(p)), portCount)
for(i <- 0 until portCount){ for(i <- 0 until portCount){
val fork = new StreamFork(FpuCommit(p), 2) val fork = new StreamFork(FpuCommit(p), 2, synchronous = true)
fork.io.input << io.port(i).commit fork.io.input << io.port(i).commit
fork.io.outputs(0) >> load(i) fork.io.outputs(0) >> load(i)
fork.io.outputs(1).pipelined(m2s = true, s2m = true) >> commit(i) //Pipelining here is light, as it only uses the flags of the payload fork.io.outputs(1).pipelined(m2s = true, s2m = true) >> commit(i) //Pipelining here is light, as it only uses the flags of the payload
@ -214,8 +199,9 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
} }
class CommitArea(source : Int) extends Area{ class CommitArea(source : Int) extends Area{
val pending = new Tracker(4)
val add, mul, div, sqrt, short = new Tracker(4) val add, mul, div, sqrt, short = new Tracker(4)
val input = commitFork.commit(source).haltWhen(List(add, mul, div, sqrt, short).map(_.full).orR).toFlow val input = commitFork.commit(source).haltWhen(List(add, mul, div, sqrt, short).map(_.full).orR || !pending.notEmpty).toFlow
when(input.fire){ when(input.fire){
add.inc setWhen(List(FpuOpcode.ADD).map(input.opcode === _).orR) add.inc setWhen(List(FpuOpcode.ADD).map(input.opcode === _).orR)
@ -224,6 +210,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
sqrt.inc setWhen(List(FpuOpcode.SQRT).map(input.opcode === _).orR) sqrt.inc setWhen(List(FpuOpcode.SQRT).map(input.opcode === _).orR)
short.inc setWhen(List(FpuOpcode.SGNJ, FpuOpcode.MIN_MAX, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR) short.inc setWhen(List(FpuOpcode.SGNJ, FpuOpcode.MIN_MAX, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR)
rf.scoreboards(source).writes(input.rd) := input.write rf.scoreboards(source).writes(input.rd) := input.write
pending.dec := True
} }
} }
@ -237,7 +224,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
val scheduler = for(portId <- 0 until portCount; val scheduler = for(portId <- 0 until portCount;
scoreboard = rf.scoreboards(portId)) yield new Area{ scoreboard = rf.scoreboards(portId)) yield new Area{
val input = io.port(portId).cmd.combStage() val input = io.port(portId).cmd.pipelined(s2m = true)
val useRs1, useRs2, useRs3, useRd = False val useRs1, useRs2, useRs3, useRd = False
switch(input.opcode){ switch(input.opcode){
is(p.Opcode.LOAD) { useRd := True } is(p.Opcode.LOAD) { useRd := True }
@ -265,7 +252,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
val rfBusy = (rfHits, rfTargets).zipped.map(_ ^ _) val rfBusy = (rfHits, rfTargets).zipped.map(_ ^ _)
val hits = (0 to 3).map(id => uses(id) && rfBusy(id)) val hits = (0 to 3).map(id => uses(id) && rfBusy(id))
val hazard = hits.orR || !rf.init.done val hazard = hits.orR || !rf.init.done || commitLogic(portId).pending.full
val output = input.haltWhen(hazard) val output = input.haltWhen(hazard)
when(input.valid && rf.init.done){ when(input.valid && rf.init.done){
scoreboard.targetWrite.address := input.rd scoreboard.targetWrite.address := input.rd
@ -273,6 +260,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
} }
when(output.fire && useRd){ when(output.fire && useRd){
scoreboard.targetWrite.valid := True scoreboard.targetWrite.valid := True
commitLogic(portId).pending.inc := True
} }
} }
@ -287,7 +275,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
} }
val read = new Area{ val read = new Area{
val s0 = cmdArbiter.output.pipelined(m2s = true, s2m = true) //TODO may need to remove m2s for store latency val s0 = cmdArbiter.output.pipelined() //TODO may need to remove m2s for store latency
val s1 = s0.m2sPipe() val s1 = s0.m2sPipe()
val output = s1.swapPayload(RfReadOutput()) val output = s1.swapPayload(RfReadOutput())
val rs1Entry = rf.ram.readSync(s0.source @@ s0.rs1,enable = !output.isStall) val rs1Entry = rf.ram.readSync(s0.source @@ s0.rs1,enable = !output.isStall)
@ -982,7 +970,6 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
when(exp(exp.getWidth-3, 3 bits) >= 5) { output.exponent(p.internalExponentSize-2, 2 bits) := 3 } when(exp(exp.getWidth-3, 3 bits) >= 5) { output.exponent(p.internalExponentSize-2, 2 bits) := 3 }
// val flag = io.port(input.source).completion.flag
when(forceNan) { when(forceNan) {
output.setNanQuiet output.setNanQuiet
NV setWhen(infinitynan || input.rs1.isNanSignaling || input.rs2.isNanSignaling) NV setWhen(infinitynan || input.rs1.isNanSignaling || input.rs2.isNanSignaling)
@ -1479,8 +1466,6 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
if (p.withDouble) output.format := input.format if (p.withDouble) output.format := input.format
output.scrap := (mantissa(1) | mantissa(0) | roundingScrap) output.scrap := (mantissa(1) | mantissa(0) | roundingScrap)
// val flag = io.port(input.source).completion.flag
output.NV := infinityNan || input.rs1.isNanSignaling || input.rs2.isNanSignaling output.NV := infinityNan || input.rs1.isNanSignaling || input.rs2.isNanSignaling
output.DZ := False output.DZ := False
when(forceNan) { when(forceNan) {

View File

@ -10,6 +10,7 @@ import vexriscv.ip.fpu._
import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.ArrayBuffer
class FpuPlugin(externalFpu : Boolean = false, class FpuPlugin(externalFpu : Boolean = false,
simHalt : Boolean = false,
p : FpuParameter) extends Plugin[VexRiscv] with VexRiscvRegressionArg { p : FpuParameter) extends Plugin[VexRiscv] with VexRiscvRegressionArg {
object FPU_ENABLE extends Stageable(Bool()) object FPU_ENABLE extends Stageable(Bool())
@ -222,10 +223,20 @@ class FpuPlugin(externalFpu : Boolean = false,
val internal = (!externalFpu).generate (pipeline plug new Area{ val internal = (!externalFpu).generate (pipeline plug new Area{
val fpu = FpuCore(1, p) val fpu = FpuCore(1, p)
if(simHalt) {
val cmdHalt = in(Bool).setName("fpuCmdHalt").addAttribute(Verilator.public)
val commitHalt = in(Bool).setName("fpuCommitHalt").addAttribute(Verilator.public)
val rspHalt = in(Bool).setName("fpuRspHalt").addAttribute(Verilator.public)
fpu.io.port(0).cmd << port.cmd.haltWhen(cmdHalt)
fpu.io.port(0).commit << port.commit.haltWhen(commitHalt)
fpu.io.port(0).rsp.haltWhen(rspHalt) >> port.rsp
fpu.io.port(0).completion <> port.completion
} else {
fpu.io.port(0).cmd << port.cmd fpu.io.port(0).cmd << port.cmd
fpu.io.port(0).commit << port.commit fpu.io.port(0).commit << port.commit
fpu.io.port(0).rsp >> port.rsp fpu.io.port(0).rsp >> port.rsp
fpu.io.port(0).completion <> port.completion fpu.io.port(0).completion <> port.completion
}
}) })

View File

@ -1848,6 +1848,11 @@ public:
instanceCycles += 1; instanceCycles += 1;
for(SimElement* simElement : simElements) simElement->postCycle(); for(SimElement* simElement : simElements) simElement->postCycle();
#ifdef RVF
top->fpuCmdHalt = VL_RANDOM_I(1);
top->fpuCommitHalt = VL_RANDOM_I(1);
top->fpuRspHalt = VL_RANDOM_I(1);
#endif
@ -3815,10 +3820,10 @@ string riscvTestMemory[] = {
string riscvTestFloat[] = { string riscvTestFloat[] = {
"rv32uf-p-fmadd",
"rv32uf-p-fadd", "rv32uf-p-fadd",
"rv32uf-p-fcmp", "rv32uf-p-fcmp",
"rv32uf-p-fcvt_w", "rv32uf-p-fcvt_w",
"rv32uf-p-fmadd",
"rv32uf-p-ldst", "rv32uf-p-ldst",
"rv32uf-p-recoding", "rv32uf-p-recoding",
"rv32uf-p-fclass", "rv32uf-p-fclass",
@ -3830,9 +3835,9 @@ string riscvTestFloat[] = {
string riscvTestDouble[] = { string riscvTestDouble[] = {
"rv32ud-p-fmadd",
"rv32ud-p-fadd", "rv32ud-p-fadd",
"rv32ud-p-fcvt", "rv32ud-p-fcvt",
"rv32ud-p-fmadd",
"rv32ud-p-recoding", "rv32ud-p-recoding",
"rv32ud-p-fclass", "rv32ud-p-fclass",
"rv32ud-p-fcvt_w", "rv32ud-p-fcvt_w",