From 3f226b758c9a1a9706dae1a16368bc447bdf3200 Mon Sep 17 00:00:00 2001 From: Dolu1990 Date: Fri, 19 Feb 2021 13:03:48 +0100 Subject: [PATCH] fpu fix exception flag handling --- src/main/scala/vexriscv/ip/fpu/FpuCore.scala | 100 +++++++++++------- .../scala/vexriscv/ip/fpu/Interface.scala | 13 +-- .../scala/vexriscv/plugin/FpuPlugin.scala | 18 ++-- src/test/cpp/regression/main.cpp | 13 ++- src/test/scala/vexriscv/ip/fpu/FpuTest.scala | 13 ++- 5 files changed, 94 insertions(+), 63 deletions(-) diff --git a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala index 4a2e208..a241099 100644 --- a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala +++ b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala @@ -125,6 +125,8 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ val scrap = Bool() val roundMode = FpuRoundMode() val format = p.withDouble generate FpuFormat() + val NV = Bool() + val DZ = Bool() //TODO } case class RoundOutput() extends Bundle{ @@ -133,6 +135,8 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ val rd = p.rfAddress() val value = p.internalFloating() val format = p.withDouble generate FpuFormat() + val NV, NX, OF, UF, DZ = Bool() + val write = Bool() } val rf = new Area{ @@ -153,20 +157,20 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ val lockFreeId = OHMasking.first(lock.map(!_.valid)) } - val completion = for(source <- 0 until portCount) yield new Area{ - def port = io.port(source) - port.completion.flag.NV := False - port.completion.flag.DZ := False - port.completion.flag.OF := False - port.completion.flag.UF := False - port.completion.flag.NX := False - - val increments = ArrayBuffer[Bool]() - - afterElaboration{ - port.completion.count := increments.map(_.asUInt.resize(log2Up(increments.size + 1))).reduceBalancedTree(_ + _) - } - } +// val completion = for(source <- 0 until portCount) yield new Area{ +// def port = io.port(source) +// 
port.completion.flag.NV := False +// port.completion.flag.DZ := False +// port.completion.flag.OF := False +// port.completion.flag.UF := False +// port.completion.flag.NX := False +// +// val increments = ArrayBuffer[Bool]() +// +// afterElaboration{ +// port.completion.count := increments.map(_.asUInt.resize(log2Up(increments.size + 1))).reduceBalancedTree(_ + _) +// } +// } val commitFork = new Area{ val load, commit = Vec(Stream(FpuCommit(p)), portCount) @@ -522,6 +526,8 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ output.value.mantissa := recoded.mantissa @@ U"0" output.value.special := recoded.special output.scrap := False + output.NV := False + output.DZ := False when(input.i2f){ output.value.sign := i2fSign output.value.exponent := (U(exponentOne+31) - fsm.shift.by).resized @@ -534,6 +540,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ output.value.mantissa := U(i2fHigh) @@ (if(p.withDouble) U"0" else U"") } } + } val shortPip = new Area{ @@ -543,8 +550,6 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ val result = p.storeLoadType().assignDontCare() - val flag = io.port(input.source).completion.flag - val halt = False val recodedResult = p.storeLoadType() val f32 = new Area{ @@ -677,7 +682,8 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ } } - + val rspNv = False + val rspNx = False val f2i = new Area{ //Will not work for 64 bits float max value rounding val unsigned = fsm.shift.output(32 downto 0) >> 1 @@ -703,9 +709,9 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ val low = overflow val high = input.arg(0) ^ overflow result := (31 -> high, default -> low) - flag.NV := input.valid && input.opcode === FpuOpcode.F2I && fsm.done && !isZero + rspNv := input.valid && input.opcode === FpuOpcode.F2I && fsm.done && !isZero } otherwise { - flag.NX := input.valid && input.opcode === FpuOpcode.F2I && fsm.done && round 
=/= 0 + rspNx := input.valid && input.opcode === FpuOpcode.F2I && fsm.done && round =/= 0 } } @@ -805,9 +811,9 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ val rs2Nan = input.rs2.isNan val rs1NanNv = input.rs1.isNan && (!input.rs1.isQuiet || signalQuiet) val rs2NanNv = input.rs2.isNan && (!input.rs2.isQuiet || signalQuiet) - val nv = List(FpuOpcode.CMP, FpuOpcode.MIN_MAX, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR && rs1NanNv || + val NV = List(FpuOpcode.CMP, FpuOpcode.MIN_MAX, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR && rs1NanNv || List(FpuOpcode.CMP, FpuOpcode.MIN_MAX).map(input.opcode === _).orR && rs2NanNv - flag.NV setWhen(input.valid && nv) + rspNv setWhen(NV) val rspStreams = Vec(Stream(FpuRsp(p)), portCount) input.ready := !halt && (toFpuRf ? rfOutput.ready | rspStreams.map(_.ready).read(input.source)) @@ -815,9 +821,14 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ def rsp = rspStreams(i) rsp.valid := input.valid && input.source === i && !toFpuRf && !halt rsp.value := result + rsp.NV := rspNv + rsp.NX := rspNx io.port(i).rsp << rsp.stage() - completion(i).increments += (RegNext(rsp.fire) init(False)) } + + + rfOutput.NV := NV + rfOutput.DZ := False } val mul = p.withMul generate new Area{ @@ -891,13 +902,14 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ output.exponent := (exp - exponentOne).resized output.mantissa := man.asUInt output.setNormal + val NV = False when(exp(exp.getWidth-3, 3 bits) >= 5) { output.exponent(p.internalExponentSize-2, 2 bits) := 3 } - val flag = io.port(input.source).completion.flag +// val flag = io.port(input.source).completion.flag when(forceNan) { output.setNanQuiet - flag.NV setWhen(input.valid && (infinitynan || input.rs1.isNanSignaling || input.rs2.isNanSignaling)) + NV setWhen(input.valid && (infinitynan || input.rs1.isNanSignaling || input.rs2.isNanSignaling)) } elsewhen(forceOverflow) { output.setInfinity } 
elsewhen(forceZero) { @@ -909,6 +921,8 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ val result = new Area { def input = norm.input + def NV = norm.NV + val notMul = new Area { val output = Flow(UInt(p.internalMantissaSize + 1 bits)) output.valid := input.valid && input.divSqrt @@ -924,6 +938,8 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ output.roundMode := input.roundMode output.scrap := norm.scrap output.value := norm.output + output.NV := NV + output.DZ := False decode.mulToAdd.valid := input.valid && input.add decode.mulToAdd.source := input.source @@ -1245,8 +1261,9 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ output.scrap := (mantissa(1) | mantissa(0) | roundingScrap) - val flag = io.port(input.source).completion.flag - flag.NV setWhen (input.valid && (infinityNan || input.rs1.isNanSignaling || input.rs2.isNanSignaling)) +// val flag = io.port(input.source).completion.flag + output.NV := (input.valid && (infinityNan || input.rs1.isNanSignaling || input.rs2.isNanSignaling)) + output.DZ := False when(forceNan) { output.value.setNanQuiet } elsewhen (forceZero) { @@ -1272,8 +1289,6 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ if(p.withMul) (inputs += mul.result.output) if(p.withShortPipMisc) (inputs += shortPip.rfOutput.pipelined(m2s = true)) val arbitrated = StreamArbiterFactory.lowerFirst.noLock.on(inputs) - val isCommited = rf.lock.map(_.commited).read(arbitrated.lockId) - val commited = arbitrated.haltWhen(!isCommited).toFlow } class RoundFront extends MergeInput{ @@ -1283,7 +1298,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ } val roundFront = new Area { - val input = merge.commited.stage() + val input = merge.arbitrated.stage() val output = input.swapPayload(new RoundFront()) output.payload.assignSomeByName(input.payload) @@ -1313,7 +1328,8 @@ case class FpuCore( portCount : Int, p : FpuParameter) 
extends Component{ val roundBack = new Area{ val input = roundFront.output.stage() - val output = input.swapPayload(RoundOutput()) + val isCommited = rf.lock.map(_.commited).read(input.lockId) + val output = input.haltWhen(!isCommited).toFlow.swapPayload(RoundOutput()) import input.payload._ val math = p.internalFloating() @@ -1375,15 +1391,16 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ nx setWhen(!input.value.special && (roundAdjusted =/= 0)) - when(input.valid){ - val flag = io.port(input.source).completion.flag - flag.NX setWhen(nx) - flag.OF setWhen(of) - flag.UF setWhen(uf) - } + val write = rf.lock.map(_.write).read(input.lockId) + output.NX := nx & write + output.OF := of & write + output.UF := uf & write + output.NV := input.NV & write + output.DZ := input.DZ & write output.source := input.source output.lockId := input.lockId output.rd := input.rd + output.write := write if(p.withDouble) output.format := input.format output.value := patched } @@ -1392,7 +1409,14 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ val input = roundBack.output.stage() for(i <- 0 until portCount){ - completion(i).increments += (RegNext(input.fire && input.source === i) init(False)) + val c = io.port(i).completion + c.valid := input.fire && input.source === i + c.flags.NX := input.NX + c.flags.OF := input.OF + c.flags.UF := input.UF + c.flags.NV := input.NV + c.flags.DZ := input.DZ + c.written := input.write } when(input.valid){ @@ -1402,7 +1426,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ } val port = rf.ram.writePort - port.valid := input.valid && rf.lock.map(_.write).read(input.lockId) + port.valid := input.valid && input.write port.address := input.source @@ input.rd port.data.value := input.value if(p.withDouble) port.data.boxed := input.format === FpuFormat.FLOAT diff --git a/src/main/scala/vexriscv/ip/fpu/Interface.scala b/src/main/scala/vexriscv/ip/fpu/Interface.scala index 
3f6542c..9e02161 100644 --- a/src/main/scala/vexriscv/ip/fpu/Interface.scala +++ b/src/main/scala/vexriscv/ip/fpu/Interface.scala @@ -136,14 +136,8 @@ case class FpuFlags() extends Bundle{ } case class FpuCompletion() extends Bundle{ - val flag = FpuFlags() - val count = UInt(2 bits) - - def stage() = { - val ret = FpuCompletion().setCompositeName(this, "stage", true) - ret := this - ret - } + val flags = FpuFlags() + val written = Bool() //Used for verification purposes } case class FpuCmd(p : FpuParameter) extends Bundle{ @@ -163,13 +157,14 @@ case class FpuCommit(p : FpuParameter) extends Bundle{ case class FpuRsp(p : FpuParameter) extends Bundle{ val value = p.storeLoadType() // IEEE754 store || Integer + val NV, NX = Bool() } case class FpuPort(p : FpuParameter) extends Bundle with IMasterSlave { val cmd = Stream(FpuCmd(p)) val commit = Stream(FpuCommit(p)) val rsp = Stream(FpuRsp(p)) - val completion = FpuCompletion() + val completion = Flow(FpuCompletion()) override def asMaster(): Unit = { master(cmd, commit) diff --git a/src/main/scala/vexriscv/plugin/FpuPlugin.scala b/src/main/scala/vexriscv/plugin/FpuPlugin.scala index b42fd17..decf981 100644 --- a/src/main/scala/vexriscv/plugin/FpuPlugin.scala +++ b/src/main/scala/vexriscv/plugin/FpuPlugin.scala @@ -140,7 +140,7 @@ class FpuPlugin(externalFpu : Boolean = false, } //TODO FMV_X_X + doubles - port = FpuPort(p) + port = FpuPort(p).addTag(Verilator.public) if(externalFpu) master(port) val dBusEncoding = pipeline.service(classOf[DBusEncodingService]) @@ -168,16 +168,16 @@ class FpuPlugin(externalFpu : Boolean = false, val csr = pipeline plug new Area{ val pendings = Reg(UInt(5 bits)) init(0) - pendings := pendings + U(port.cmd.fire) - port.completion.count + pendings := pendings + U(port.cmd.fire) - U(port.completion.fire) - U(port.rsp.fire) val hasPending = pendings =/= 0 val flags = Reg(FpuFlags()) - flags.NV init(False) setWhen(port.completion.flag.NV) - flags.DZ init(False) 
setWhen(port.completion.flag.DZ) - flags.OF init(False) setWhen(port.completion.flag.OF) - flags.UF init(False) setWhen(port.completion.flag.UF) - flags.NX init(False) setWhen(port.completion.flag.NX) + flags.NV init(False) setWhen(port.completion.fire && port.completion.flags.NV) + flags.DZ init(False) setWhen(port.completion.fire && port.completion.flags.DZ) + flags.OF init(False) setWhen(port.completion.fire && port.completion.flags.OF) + flags.UF init(False) setWhen(port.completion.fire && port.completion.flags.UF) + flags.NX init(False) setWhen(port.completion.fire && port.completion.flags.NX) val service = pipeline.service(classOf[CsrInterface]) val rm = Reg(Bits(3 bits)) init(0) @@ -244,6 +244,10 @@ class FpuPlugin(externalFpu : Boolean = false, when(arbitration.isValid) { dBusEncoding.bypassStore(storeFormated) output(REGFILE_WRITE_DATA) := port.rsp.value(31 downto 0) + when(!arbitration.isStuck && !arbitration.isRemoved){ + csr.flags.NV setWhen(port.rsp.NV) + csr.flags.NX setWhen(port.rsp.NX) + } } when(!port.rsp.valid){ arbitration.haltByOther := True diff --git a/src/test/cpp/regression/main.cpp b/src/test/cpp/regression/main.cpp index 4d50019..f2cf452 100644 --- a/src/test/cpp/regression/main.cpp +++ b/src/test/cpp/regression/main.cpp @@ -239,6 +239,9 @@ class success : public std::exception { }; #define MSTATUS_READ_MASK 0x1888 #endif +#define u32 uint32_t +#define u32 uint64_t + class RiscvGolden { public: @@ -4043,26 +4046,26 @@ int main(int argc, char **argv, char **env) { #endif for(const string &name : riscvTestMain){ - redo(REDO,RiscvTest(name).run();) + redo(REDO,RiscvTest(name).withRiscvRef()->run();) } for(const string &name : riscvTestMemory){ - redo(REDO,RiscvTest(name).run();) + redo(REDO,RiscvTest(name).withRiscvRef()->run();) } #ifdef MUL for(const string &name : riscvTestMul){ - redo(REDO,RiscvTest(name).run();) + redo(REDO,RiscvTest(name).withRiscvRef()->run();) } #endif #ifdef DIV for(const string &name : riscvTestDiv){ - 
redo(REDO,RiscvTest(name).run();) + redo(REDO,RiscvTest(name).withRiscvRef()->run();) } #endif #ifdef COMPRESSED - redo(REDO,RiscvTest("rv32uc-p-rvc").bootAt(0x800000FCu)->run()); + redo(REDO,RiscvTest("rv32uc-p-rvc").withRiscvRef()->bootAt(0x800000FCu)->run()); #endif #if defined(CSR) && !defined(CSR_SKIP_TEST) diff --git a/src/test/scala/vexriscv/ip/fpu/FpuTest.scala b/src/test/scala/vexriscv/ip/fpu/FpuTest.scala index 7f37a62..163460d 100644 --- a/src/test/scala/vexriscv/ip/fpu/FpuTest.scala +++ b/src/test/scala/vexriscv/ip/fpu/FpuTest.scala @@ -55,13 +55,13 @@ class FpuTest extends FunSuite{ } def testP(p : FpuParameter){ - val portCount = 1 + val portCount = 4 val config = SimConfig config.allOptimisation // if(p.withDouble) config.withFstWave config.compile(new FpuCore(portCount, p){ - for(i <- 0 until portCount) out(Bits(5 bits)).setName(s"flagAcc$i") := io.port(i).completion.flag.asBits + for(i <- 0 until portCount) out(Bits(5 bits)).setName(s"flagAcc$i") := io.port(i).completion.flags.asBits setDefinitionName("FpuCore"+ (if(p.withDouble) "Double" else "")) }).doSim(seed = 42){ dut => dut.clockDomain.forkStimulus(10) @@ -228,8 +228,10 @@ class FpuTest extends FunSuite{ val flagAggregated = dut.reflectBaseType(s"flagAcc$id").asInstanceOf[Bits] dut.clockDomain.onSamplings{ val c = dut.io.port(id).completion - pendingMiaou -= c.count.toInt - flagAccumulator |= flagAggregated.toInt + if(c.valid.toBoolean) { + pendingMiaou -= 1 + flagAccumulator |= flagAggregated.toInt + } dut.writeback.randomSim.randomize() } @@ -242,6 +244,9 @@ class FpuTest extends FunSuite{ StreamMonitor(dut.io.port(id)rsp, dut.clockDomain){payload => + pendingMiaou -= 1 + if(payload.NV.toBoolean) flagAccumulator |= 1 << 4 + if(payload.NX.toBoolean) flagAccumulator |= 1 << 0 rspQueue.dequeue().apply(payload) }