From e504afbf1897fcec8037473e38dc2f92525dfcb1 Mon Sep 17 00:00:00 2001 From: Dolu1990 Date: Fri, 19 Feb 2021 11:26:28 +0100 Subject: [PATCH] fpu integration wip, got mandelbrot to work in linux with no inline (crash when inlined) --- src/main/scala/vexriscv/TestsWorkspace.scala | 2 +- .../demo/smp/VexRiscvSmpCluster.scala | 17 ++++- src/main/scala/vexriscv/ip/DataCache.scala | 2 +- src/main/scala/vexriscv/ip/fpu/FpuCore.scala | 65 ++++++++++++------- .../scala/vexriscv/ip/fpu/Interface.scala | 6 ++ .../vexriscv/plugin/DBusCachedPlugin.scala | 7 ++ .../scala/vexriscv/plugin/FpuPlugin.scala | 7 +- 7 files changed, 75 insertions(+), 31 deletions(-) diff --git a/src/main/scala/vexriscv/TestsWorkspace.scala b/src/main/scala/vexriscv/TestsWorkspace.scala index ac07cb8..bcda45f 100644 --- a/src/main/scala/vexriscv/TestsWorkspace.scala +++ b/src/main/scala/vexriscv/TestsWorkspace.scala @@ -33,7 +33,7 @@ import vexriscv.ip.fpu.FpuParameter object TestsWorkspace { def main(args: Array[String]) { SpinalConfig().generateVerilog { -// make clean all REDO=10 CSR=no MMU=no COREMARK=no RVF=no REDO=1 DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 DEBUG=ye TRACE=ye + // make clean all REDO=10 CSR=no MMU=no COREMARK=no RVF=yes RVD=yes REDO=1 DBUS_LOAD_DATA_WIDTH=64 DBUS_STORE_DATA_WIDTH=64 DEBUG=ye TRACE=ye val config = VexRiscvConfig( plugins = List( new IBusCachedPlugin( diff --git a/src/main/scala/vexriscv/demo/smp/VexRiscvSmpCluster.scala b/src/main/scala/vexriscv/demo/smp/VexRiscvSmpCluster.scala index d3da4b3..5b66670 100644 --- a/src/main/scala/vexriscv/demo/smp/VexRiscvSmpCluster.scala +++ b/src/main/scala/vexriscv/demo/smp/VexRiscvSmpCluster.scala @@ -15,12 +15,13 @@ import spinal.lib.generator.Handle import spinal.lib.misc.plic.PlicMapping import spinal.lib.system.debugger.SystemDebuggerConfig import vexriscv.ip.{DataCacheAck, DataCacheConfig, DataCacheMemBus, InstructionCache, InstructionCacheConfig} -import vexriscv.plugin.{BranchPlugin, CsrAccess, CsrPlugin, 
CsrPluginConfig, DBusCachedPlugin, DBusSimplePlugin, DYNAMIC_TARGET, DebugPlugin, DecoderSimplePlugin, FullBarrelShifterPlugin, HazardSimplePlugin, IBusCachedPlugin, IBusSimplePlugin, IntAluPlugin, MmuPlugin, MmuPortConfig, MulDivIterativePlugin, MulPlugin, RegFilePlugin, STATIC, SrcPlugin, StaticMemoryTranslatorPlugin, YamlPlugin} +import vexriscv.plugin.{BranchPlugin, CsrAccess, CsrPlugin, CsrPluginConfig, DBusCachedPlugin, DBusSimplePlugin, DYNAMIC_TARGET, DebugPlugin, DecoderSimplePlugin, FpuPlugin, FullBarrelShifterPlugin, HazardSimplePlugin, IBusCachedPlugin, IBusSimplePlugin, IntAluPlugin, MmuPlugin, MmuPortConfig, MulDivIterativePlugin, MulPlugin, RegFilePlugin, STATIC, SrcPlugin, StaticMemoryTranslatorPlugin, YamlPlugin} import vexriscv.{Riscv, VexRiscv, VexRiscvBmbGenerator, VexRiscvConfig, plugin} import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import spinal.lib.generator._ +import vexriscv.ip.fpu.FpuParameter case class VexRiscvSmpClusterParameter(cpuConfigs : Seq[VexRiscvConfig], withExclusiveAndInvalidation : Boolean, forcePeripheralWidth : Boolean = true, outOfOrderDecoder : Boolean = true) @@ -163,10 +164,15 @@ object VexRiscvSmpClusterGen { earlyBranch : Boolean = false, dBusCmdMasterPipe : Boolean = false, withMmu : Boolean = true, - withSupervisor : Boolean = true + withSupervisor : Boolean = true, + withFloat : Boolean = false, + withDouble : Boolean = false, + externalFpu : Boolean = true ) = { assert(iCacheSize/iCacheWays <= 4096, "Instruction cache ways can't be bigger than 4096 bytes") assert(dCacheSize/dCacheWays <= 4096, "Data cache ways can't be bigger than 4096 bytes") + assert(!(withDouble && !withFloat)) + val config = VexRiscvConfig( plugins = List( if(withMmu)new MmuPlugin( @@ -262,7 +268,7 @@ object VexRiscvSmpClusterGen { mulUnrollFactor = 32, divUnrollFactor = 1 ), - new CsrPlugin(CsrPluginConfig.openSbi(mhartid = hartId, misa = Riscv.misaToInt("imas")).copy(utimeAccess = CsrAccess.READ_ONLY)), + new 
CsrPlugin(CsrPluginConfig.openSbi(mhartid = hartId, misa = Riscv.misaToInt(s"ima${if(withFloat) "f" else ""}${if(withDouble) "d" else ""}s")).copy(utimeAccess = CsrAccess.READ_ONLY)), new BranchPlugin( earlyBranch = earlyBranch, catchAddressMisaligned = true, @@ -271,6 +277,11 @@ object VexRiscvSmpClusterGen { new YamlPlugin(s"cpu$hartId.yaml") ) ) + + if(withFloat) config.plugins += new FpuPlugin( + externalFpu = true, + p = FpuParameter(withDouble = withDouble) + ) config } diff --git a/src/main/scala/vexriscv/ip/DataCache.scala b/src/main/scala/vexriscv/ip/DataCache.scala index 4fbcdeb..f12250b 100644 --- a/src/main/scala/vexriscv/ip/DataCache.scala +++ b/src/main/scala/vexriscv/ip/DataCache.scala @@ -675,7 +675,7 @@ class DataCache(val p : DataCacheConfig, mmuParameter : MemoryTranslatorBusParam val rspSync = True val rspLast = True - val memCmdSent = RegInit(False) setWhen (io.mem.cmd.ready) clearWhen (!io.cpu.writeBack.isStuck) + val memCmdSent = RegInit(False) setWhen (io.mem.cmd.fire) clearWhen (!io.cpu.writeBack.isStuck) val pending = withExclusive generate new Area{ val counter = Reg(UInt(log2Up(pendingMax) + 1 bits)) init(0) val counterNext = counter + U(io.mem.cmd.fire && io.mem.cmd.last) - ((io.mem.rsp.valid && io.mem.rsp.last) ? 
(io.mem.rsp.aggregated +^ 1) | 0) diff --git a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala index dca4f19..4a2e208 100644 --- a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala +++ b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala @@ -207,7 +207,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ arbiterOutput.source := arbiter.io.chosen arbiterOutput.payload.assignSomeByName(arbiter.io.output.payload) - val s0 = arbiterOutput.pipelined(m2s = true, s2m = true) + val s0 = arbiterOutput.pipelined(m2s = true, s2m = true) //TODO may need to remove m2s for store latency val useRs1, useRs2, useRs3, useRd = False switch(s0.opcode){ is(p.Opcode.LOAD) { useRd := True } @@ -287,28 +287,28 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ } val decode = new Area{ - val input = read.output.combStage() + val input = read.output/*.s2mPipe()*/.combStage() input.ready := False val loadHit = List(FpuOpcode.LOAD, FpuOpcode.FMV_W_X, FpuOpcode.I2F).map(input.opcode === _).orR val load = Stream(LoadInput()) load.valid := input.valid && loadHit input.ready setWhen(loadHit && load.ready) - load.payload.assignSomeByName(read.output.payload) + load.payload.assignSomeByName(input.payload) load.i2f := input.opcode === FpuOpcode.I2F val shortPipHit = List(FpuOpcode.STORE, FpuOpcode.F2I, FpuOpcode.CMP, FpuOpcode.MIN_MAX, FpuOpcode.SGNJ, FpuOpcode.FMV_X_W, FpuOpcode.FCLASS, FpuOpcode.FCVT_X_X).map(input.opcode === _).orR val shortPip = Stream(ShortPipInput()) input.ready setWhen(shortPipHit && shortPip.ready) shortPip.valid := input.valid && shortPipHit - shortPip.payload.assignSomeByName(read.output.payload) + shortPip.payload.assignSomeByName(input.payload) val divSqrtHit = input.opcode === p.Opcode.DIV || input.opcode === p.Opcode.SQRT val divSqrt = Stream(DivSqrtInput()) if(p.withDivSqrt) { input.ready setWhen (divSqrtHit && divSqrt.ready) divSqrt.valid := input.valid && divSqrtHit - 
divSqrt.payload.assignSomeByName(read.output.payload) + divSqrt.payload.assignSomeByName(input.payload) divSqrt.div := input.opcode === p.Opcode.DIV } @@ -324,15 +324,15 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ divSqrtToMul.ready := mul.ready mul.payload := divSqrtToMul.payload when(!divSqrtToMul.valid) { - mul.payload.assignSomeByName(read.output.payload) + mul.payload.assignSomeByName(input.payload) mul.add := fmaHit mul.divSqrt := False mul.msb1 := True mul.msb2 := True mul.rs2.sign.allowOverride(); - mul.rs2.sign := read.output.rs2.sign ^ input.arg(0) + mul.rs2.sign := input.rs2.sign ^ input.arg(0) mul.rs3.sign.allowOverride(); - mul.rs3.sign := read.output.rs3.sign ^ input.arg(1) + mul.rs3.sign := input.rs3.sign ^ input.arg(1) } } @@ -348,9 +348,9 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ mulToAdd.ready := add.ready add.payload := mulToAdd.payload when(!mulToAdd.valid) { - add.payload.assignSomeByName(read.output.payload) + add.payload.assignSomeByName(input.payload) add.rs2.sign.allowOverride; - add.rs2.sign := read.output.rs2.sign ^ input.arg(0) + add.rs2.sign := input.rs2.sign ^ input.arg(0) } } } @@ -578,7 +578,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ val input = UInt(p.internalMantissaSize+1 max 33 bits).assignDontCare() var logic = input val scrap = Reg(Bool) - for(i <- by.range){ + for(i <- by.range.reverse){ scrap setWhen(by(i) && logic(0, 1 << i bits) =/= 0) logic \= by(i) ? (logic |>> (BigInt(1) << i)) | logic } @@ -809,11 +809,13 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ List(FpuOpcode.CMP, FpuOpcode.MIN_MAX).map(input.opcode === _).orR && rs2NanNv flag.NV setWhen(input.valid && nv) - input.ready := !halt && (toFpuRf ? rfOutput.ready | io.port.map(_.rsp.ready).read(input.source)) + val rspStreams = Vec(Stream(FpuRsp(p)), portCount) + input.ready := !halt && (toFpuRf ? 
rfOutput.ready | rspStreams.map(_.ready).read(input.source)) for(i <- 0 until portCount){ - def rsp = io.port(i).rsp + def rsp = rspStreams(i) rsp.valid := input.valid && input.source === i && !toFpuRf && !halt rsp.value := result + io.port(i).rsp << rsp.stage() completion(i).increments += (RegNext(rsp.fire) init(False)) } } @@ -940,7 +942,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ } val divSqrt = p.withDivSqrt generate new Area { - val input = decode.divSqrt.stage() + val input = decode.divSqrt.halfPipe() val aproxWidth = 8 val aproxDepth = 64 @@ -1142,7 +1144,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ val rs1ExponentEqual = input.rs1.exponent === input.rs2.exponent val rs1MantissaBigger = input.rs1.mantissa > input.rs2.mantissa val absRs1Bigger = ((rs1ExponentBigger || rs1ExponentEqual && rs1MantissaBigger) && !input.rs1.isZero || input.rs1.isInfinity) && !input.rs2.isInfinity - val shiftBy = rs1ExponentBigger ? (0-exp21) | exp21 + val shiftBy = exp21.asSInt.abs//rs1ExponentBigger ? (0-exp21) | exp21 val shiftOverflow = (shiftBy >= p.internalMantissaSize+3) val passThrough = shiftOverflow || (input.rs1.isZero) || (input.rs2.isZero) @@ -1153,8 +1155,8 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ val xMantissa = U"1" @@ (rs1ExponentBigger ? input.rs1.mantissa | input.rs2.mantissa) @@ U"00" val yMantissaUnshifted = U"1" @@ (rs1ExponentBigger ? input.rs2.mantissa | input.rs1.mantissa) @@ U"00" var yMantissa = CombInit(yMantissaUnshifted) - val roundingScrap = CombInit(shiftOverflow) - for(i <- 0 until log2Up(p.internalMantissaSize)){ + val roundingScrap = False + for(i <- log2Up(p.internalMantissaSize) - 1 downto 0){ roundingScrap setWhen(shiftBy(i) && yMantissa(0, 1 << i bits) =/= 0) yMantissa \= shiftBy(i) ? 
(yMantissa |>> (BigInt(1) << i)) | yMantissa } @@ -1181,6 +1183,25 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ val xSigned = xMantissa.twoComplement(xSign) //TODO Is that necessary ? val ySigned = ((ySign ## Mux(ySign, ~yMantissa, yMantissa)).asUInt + (ySign && !roundingScrap).asUInt).asSInt //rounding here output.xyMantissa := U(xSigned +^ ySigned).trim(1 bits) + + } + + class OhOutput extends MathOutput{ +// val shiftOh = Vec(Bool, p.internalMantissaSize+4) + val shift = UInt(log2Up(p.internalMantissaSize+4) bits) + } + + val oh = new Area { + val input = math.output.stage() + val output = input.swapPayload(new OhOutput) + output.payload.assignSomeByName(input.payload) + import input.payload._ + + val shiftOh = OHMasking.first(output.xyMantissa.asBools.reverse) //The OHMasking.first can be processed in parallel to the xyMantissa carry chain +// output.shiftOh := shiftOh + + val shift = OHToUInt(shiftOh) + output.shift := shift } @@ -1193,13 +1214,11 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ } val norm = new Area{ - val input = math.output.stage() + val input = oh.output.stage() val output = input.swapPayload(new NormOutput) output.payload.assignSomeByName(input.payload) import input.payload._ - val shiftOh = OHMasking.first(xyMantissa.asBools.reverse) - val shift = OHToUInt(shiftOh) output.mantissa := (xyMantissa |<< shift) output.exponent := xyExponent -^ shift + 1 output.forceInfinity := (input.rs1.isInfinity || input.rs2.isInfinity) @@ -1210,7 +1229,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ } val result = new Area { - val input = norm.output.stage() + val input = norm.output.pipelined() val output = input.swapPayload(new MergeInput()) import input.payload._ @@ -1251,7 +1270,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ inputs += load.s1.output.stage() if(p.withAdd) (inputs += add.result.output) if(p.withMul) (inputs += 
mul.result.output) - if(p.withShortPipMisc) (inputs += shortPip.rfOutput) + if(p.withShortPipMisc) (inputs += shortPip.rfOutput.pipelined(m2s = true)) val arbitrated = StreamArbiterFactory.lowerFirst.noLock.on(inputs) val isCommited = rf.lock.map(_.commited).read(arbitrated.lockId) val commited = arbitrated.haltWhen(!isCommited).toFlow @@ -1301,7 +1320,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ val mantissaRange = p.internalMantissaSize downto 1 val adderMantissa = input.value.mantissa(mantissaRange) & (mantissaIncrement ? ~(exactMask.trim(1) >> 1) | input.value.mantissa(mantissaRange).maxValue) val adderRightOp = (mantissaIncrement ? (exactMask >> 1)| U(0)).resize(p.internalMantissaSize bits) - val adder = (input.value.exponent @@ adderMantissa) + adderRightOp + U(mantissaIncrement) + val adder = KeepAttribute(KeepAttribute(input.value.exponent @@ adderMantissa) + KeepAttribute(adderRightOp) + KeepAttribute(U(mantissaIncrement))) math.special := input.value.special math.sign := input.value.sign math.exponent := adder(p.internalMantissaSize, p.internalExponentSize bits) diff --git a/src/main/scala/vexriscv/ip/fpu/Interface.scala b/src/main/scala/vexriscv/ip/fpu/Interface.scala index 8f968fe..3f6542c 100644 --- a/src/main/scala/vexriscv/ip/fpu/Interface.scala +++ b/src/main/scala/vexriscv/ip/fpu/Interface.scala @@ -138,6 +138,12 @@ case class FpuFlags() extends Bundle{ case class FpuCompletion() extends Bundle{ val flag = FpuFlags() val count = UInt(2 bits) + + def stage() = { + val ret = FpuCompletion().setCompositeName(this, "stage", true) + ret := this + ret + } } case class FpuCmd(p : FpuParameter) extends Bundle{ diff --git a/src/main/scala/vexriscv/plugin/DBusCachedPlugin.scala b/src/main/scala/vexriscv/plugin/DBusCachedPlugin.scala index df50137..9c939bf 100644 --- a/src/main/scala/vexriscv/plugin/DBusCachedPlugin.scala +++ b/src/main/scala/vexriscv/plugin/DBusCachedPlugin.scala @@ -72,6 +72,9 @@ class DBusCachedPlugin(val 
config : DataCacheConfig, MEMORY_WR -> False ) ++ (if(catchSomething) List(HAS_SIDE_EFFECT -> True) else Nil) ) + + if(withLrSc) decoderService.add(key, Seq(MEMORY_LRSC -> False)) + if(withAmo) decoderService.add(key, Seq(MEMORY_AMO -> False)) } override def addStoreWordEncoding(key : MaskedLiteral): Unit = { val decoderService = pipeline.service(classOf[DecoderService]) @@ -91,6 +94,9 @@ class DBusCachedPlugin(val config : DataCacheConfig, MEMORY_WR -> True ) ++ (if(catchSomething) List(HAS_SIDE_EFFECT -> True) else Nil) ) + + if(withLrSc) decoderService.add(key, Seq(MEMORY_LRSC -> False)) + if(withAmo) decoderService.add(key, Seq(MEMORY_AMO -> False)) } val bypassStoreList = ArrayBuffer[(Bool, Bits)]() @@ -501,6 +507,7 @@ class DBusCachedPlugin(val config : DataCacheConfig, dBusAccess.rsp.error := cache.io.cpu.writeBack.unalignedAccess || cache.io.cpu.writeBack.accessError dBusAccess.rsp.redo := cache.io.cpu.redo component.addPrePopTask{() => + managementStage.input(IS_DBUS_SHARING).getDrivingReg clearWhen(dBusAccess.rsp.fire) when(forceDatapath){ execute.output(REGFILE_WRITE_DATA) := dBusAccess.cmd.address.asBits } diff --git a/src/main/scala/vexriscv/plugin/FpuPlugin.scala b/src/main/scala/vexriscv/plugin/FpuPlugin.scala index 88f3481..b42fd17 100644 --- a/src/main/scala/vexriscv/plugin/FpuPlugin.scala +++ b/src/main/scala/vexriscv/plugin/FpuPlugin.scala @@ -157,13 +157,13 @@ class FpuPlugin(externalFpu : Boolean = false, import pipeline.config._ import Riscv._ - val internal = !externalFpu generate pipeline plug new Area{ + val internal = (!externalFpu).generate (pipeline plug new Area{ val fpu = FpuCore(1, p) fpu.io.port(0).cmd << port.cmd fpu.io.port(0).commit << port.commit fpu.io.port(0).rsp >> port.rsp fpu.io.port(0).completion <> port.completion - } + }) val csr = pipeline plug new Area{ @@ -195,6 +195,7 @@ class FpuPlugin(externalFpu : Boolean = false, fs := 3 //DIRTY } service.rw(CSR.SSTATUS, 13, fs) + service.rw(CSR.MSTATUS, 13, fs) } decode plug new 
Area{ @@ -259,7 +260,7 @@ class FpuPlugin(externalFpu : Boolean = false, commit.write := arbitration.isValid && !arbitration.removeIt commit.sync := input(FPU_COMMIT_SYNC) - when(arbitration.isValid && !commit.ready){ + when(isCommit && !commit.ready){ arbitration.haltByOther := True }