From 8761d0d9eed9e3550e3fb7446812038941fce89b Mon Sep 17 00:00:00 2001 From: Dolu1990 Date: Wed, 13 Jan 2021 18:28:26 +0100 Subject: [PATCH] FpuCore can add/mul/fma/store/load --- src/main/scala/vexriscv/ip/fpu/FpuCore.scala | 387 ++++++++++++++++++ .../scala/vexriscv/ip/fpu/Interface.scala | 77 ++++ src/test/scala/vexriscv/ip/fpu/FpuTest.scala | 245 +++++++++++ .../scala/vexriscv/ip/fpu/Playground.scala | 39 ++ 4 files changed, 748 insertions(+) create mode 100644 src/main/scala/vexriscv/ip/fpu/FpuCore.scala create mode 100644 src/main/scala/vexriscv/ip/fpu/Interface.scala create mode 100644 src/test/scala/vexriscv/ip/fpu/FpuTest.scala create mode 100644 src/test/scala/vexriscv/ip/fpu/Playground.scala diff --git a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala new file mode 100644 index 0000000..b93dd0f --- /dev/null +++ b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala @@ -0,0 +1,387 @@ +package vexriscv.ip.fpu + +import spinal.core._ +import spinal.lib._ +import spinal.lib.eda.bench.{Bench, Rtl, XilinxStdTargets} + +import scala.collection.mutable.ArrayBuffer + +case class FpuCore(p : FpuParameter) extends Component{ + val io = new Bundle { + val port = slave(FpuPort(p)) + } + + val rfLockCount = 5 + val lockIdType = HardType(UInt(log2Up(rfLockCount) bits)) + + io.port.rsp.valid := False + io.port.rsp.payload.assignDontCare() + + case class RfReadInput() extends Bundle{ + val source = p.source() + val opcode = p.Opcode() + val rs1, rs2, rs3 = p.rfAddress() + val rd = p.rfAddress() + } + + case class RfReadOutput() extends Bundle{ + val source = p.source() + val opcode = p.Opcode() + val lockId = lockIdType() + val rs1, rs2, rs3 = p.internalFloating() + val rd = p.rfAddress() + } + + + case class LoadInput() extends Bundle{ + val source = p.source() + val rs1 = p.internalFloating() + val rd = p.rfAddress() + val lockId = lockIdType() + } + + case class StoreInput() extends Bundle{ + val source = p.source() + val rs2 = p.internalFloating() + } + + case class MulInput() extends Bundle{ + val source = p.source() + val rs1, rs2, rs3 = p.internalFloating() + val rd = p.rfAddress() + val lockId = lockIdType() + val add = Bool() + val minus = Bool() + } + + case class AddInput() extends Bundle{ + val source = p.source() + val rs1, rs2 = p.internalFloating() + val rd = p.rfAddress() + val lockId = lockIdType() + } + + case class WriteInput() extends Bundle{ + val source = p.source() + val lockId = lockIdType() + val rd = p.rfAddress() + val value = p.internalFloating() + } + + + val rf = new Area{ + val ram = Mem(p.internalFloating, 32*(1 << p.sourceWidth)) + val lock = for(i <- 0 until rfLockCount) yield new Area{ + val valid = RegInit(False) + val source = Reg(p.source) + val address = Reg(p.rfAddress) + } + val lockFree = !lock.map(_.valid).andR + val lockFreeId = OHMasking.first(lock.map(!_.valid)) + } + + val read = new Area{ + val s0 = Stream(RfReadInput()) + s0.arbitrationFrom(io.port.cmd) + s0.payload.assignSomeByName(io.port.cmd.payload) + + val useRs1, useRs2, useRs3, useRd = False + switch(s0.opcode){ + is(p.Opcode.LOAD){ + useRd := True + } + is(p.Opcode.STORE){ + useRs2 := True + } + is(p.Opcode.ADD){ + useRs1 := True + useRs2 := True + useRd := True + } + is(p.Opcode.MUL){ + useRs1 := True + useRs2 := True + useRd := True + } + is(p.Opcode.FMA){ + useRs1 := True + useRs2 := True + useRs3 := True //Can be delayed to have less hazard + useRd := True + } + } + + val hits = List((useRs1, s0.rs1), (useRs2, s0.rs2), (useRs3, s0.rs3), (useRd, s0.rd)).map{case (use, reg) => use && rf.lock.map(l => l.valid && l.source === s0.source && l.address === reg).orR} + val hazard = hits.orR + when(s0.fire && useRd){ + for(i <- 0 until rfLockCount){ + when(rf.lockFreeId(i)){ + rf.lock(i).valid := True + rf.lock(i).source := s0.source + rf.lock(i).address := s0.rd + } + } + } + + val s1 = s0.haltWhen(hazard || !rf.lockFree).m2sPipe() + val output = s1.swapPayload(RfReadOutput()) + val s1LockId = RegNextWhen(OHToUInt(rf.lockFreeId), !output.isStall) + output.source := s1.source + output.opcode := s1.opcode + output.lockId := s1LockId + output.rd := s1.rd + output.rs1 := rf.ram.readSync(s0.rs1,enable = !output.isStall) + output.rs2 := rf.ram.readSync(s0.rs2,enable = !output.isStall) + output.rs3 := rf.ram.readSync(s0.rs3,enable = !output.isStall) + } + + val decode = new Area{ + val input = read.output.combStage() + input.ready := False + + val loadHit = input.opcode === p.Opcode.LOAD + val load = Stream(LoadInput()) + load.valid := input.valid && loadHit + input.ready setWhen(loadHit && load.ready) + load.source := read.output.source + load.rd := read.output.rd + load.rs1 := read.output.rs1 + load.lockId := read.output.lockId + + val storeHit = input.opcode === p.Opcode.STORE + val store = Stream(StoreInput()) + input.ready setWhen(storeHit && store.ready) + store.valid := input.valid && storeHit + store.source := read.output.source + store.rs2 := read.output.rs2 + + + val fmaHit = input.opcode === p.Opcode.FMA + val mulHit = input.opcode === p.Opcode.MUL || fmaHit + val mul = Stream(MulInput()) + input.ready setWhen(mulHit && mul.ready) + mul.valid := input.valid && mulHit + mul.source := read.output.source + mul.rs1 := read.output.rs1 + mul.rs2 := read.output.rs2 + mul.rs3 := read.output.rs3 + mul.rd := read.output.rd + mul.lockId := read.output.lockId + mul.add := fmaHit + mul.minus := False //TODO + + val addHit = input.opcode === p.Opcode.ADD + val add = Stream(AddInput()) + val mulToAdd = Stream(AddInput()) + + input.ready setWhen(addHit && add.ready && !mulToAdd.valid) + add.valid := input.valid && addHit || mulToAdd.valid + + + mulToAdd.ready := add.ready + add.payload := mulToAdd.payload + when(!mulToAdd.valid) { + add.payload.assignSomeByName(read.output.payload) + } + + + } + + val load = new Area{ + def input = decode.load + + val output = input.stage() + } + + + val store = new Area{ + val input = decode.store.stage() + + input.ready := io.port.rsp.ready + when(input.valid){ + io.port.rsp.valid := True + io.port.rsp.source := input.source + io.port.rsp.value := input.rs2.asBits + } + } + + val mul = new Area{ + val input = decode.mul.stage() + + val math = new Area { + val mulA = U"1" @@ input.rs1.mantissa + val mulB = U"1" @@ input.rs2.mantissa + val mulC = mulA * mulB + val exp = input.rs1.exponent +^ input.rs2.exponent - ((1 << p.internalExponentSize - 1) - 1) + } + + val norm = new Area{ + val needShift = math.mulC.msb + val exp = math.exp + U(needShift) + val man = needShift ? math.mulC(p.internalMantissaSize + 1, p.internalMantissaSize bits) | math.mulC(p.internalMantissaSize, p.internalMantissaSize bits) + + val output = FpuFloat(p.internalExponentSize, p.internalMantissaSize) + output.sign := input.rs1.sign ^ input.rs2.sign + output.exponent := exp.resized + output.mantissa := man + } + + val output = Stream(WriteInput()) + output.valid := input.valid && !input.add + output.source := input.source + output.lockId := input.lockId + output.rd := input.rd + output.value := norm.output + + decode.mulToAdd.valid := input.valid && input.add + decode.mulToAdd.source := input.source + decode.mulToAdd.rs1.mantissa := norm.output.mantissa + decode.mulToAdd.rs1.exponent := norm.output.exponent + decode.mulToAdd.rs1.sign := norm.output.sign ^ input.minus + decode.mulToAdd.rs2 := input.rs3 + decode.mulToAdd.rd := input.rd + decode.mulToAdd.lockId := input.lockId + + input.ready := (input.add ? decode.mulToAdd.ready | output.ready) + } + + val add = new Area{ + val input = decode.add.stage() + + val shifter = new Area { + val exp21 = input.rs2.exponent - input.rs1.exponent + val rs1ExponentBigger = exp21.msb + val rs1ExponentEqual = input.rs1.exponent === input.rs2.exponent + val rs1MantissaBigger = input.rs1.mantissa > input.rs2.mantissa + val absRs1Bigger = rs1ExponentBigger|| rs1ExponentEqual && rs1MantissaBigger + val shiftBy = rs1ExponentBigger ? (0-exp21) | exp21 + + //Note that rs1ExponentBigger can be replaced by absRs1Bigger bellow to avoid xsigned two complement in math block at expense of combinatorial path + val xySign = absRs1Bigger ? input.rs1.sign | input.rs2.sign + val xSign = xySign ^ (rs1ExponentBigger ? input.rs1.sign | input.rs2.sign) + val ySign = xySign ^ (rs1ExponentBigger ? input.rs2.sign | input.rs1.sign) + val xMantissa = U"1" @@ (rs1ExponentBigger ? input.rs1.mantissa | input.rs2.mantissa) + val yMantissaUnshifted = U"1" @@ (rs1ExponentBigger ? input.rs2.mantissa | input.rs1.mantissa) + val yMantissa = yMantissaUnshifted >> shiftBy + val xyExponent = rs1ExponentBigger ? input.rs1.exponent | input.rs2.exponent + } + + val math = new Area { + def xSign = shifter.xSign + def ySign = shifter.ySign + def xMantissa = shifter.xMantissa + def yMantissa = shifter.yMantissa + def xyExponent = shifter.xyExponent + def xySign = shifter.xySign + + val xSigned = xMantissa.twoComplement(xSign) + val ySigned = yMantissa.twoComplement(ySign) + val xyMantissa = U(xSigned +^ ySigned).trim(1 bits) + } + + val norm = new Area{ + def xyExponent = math.xyExponent + def xyMantissa = math.xyMantissa + def xySign = math.xySign + + val shiftOh = OHMasking.first(xyMantissa.asBools.reverse) + val shift = OHToUInt(shiftOh) + val mantissa = (xyMantissa |<< shift) >> 1 + val exponent = xyExponent - shift + 1 + } + + + val output = input.swapPayload(WriteInput()) + output.source := input.source + output.lockId := input.lockId + output.rd := input.rd + output.value.sign := norm.xySign + output.value.mantissa := norm.mantissa.resized + output.value.exponent := norm.exponent + } + + + val write = new Area{ + val port = rf.ram.writePort + port.valid := False + port.payload.assignDontCare() + + + val lockFree = Flow(lockIdType) + lockFree.valid := port.fire + lockFree.payload.assignDontCare() + + load.output.ready := False + mul.output.ready := False + add.output.ready := True + io.port.commit.ready := False + when(add.output.valid) { + port.valid := True + port.address := add.output.source @@ add.output.rd + port.data := add.output.value + + lockFree.payload := add.output.lockId + } elsewhen(mul.output.valid) { + port.valid := True + port.address := mul.output.source @@ mul.output.rd + port.data := mul.output.value + + mul.output.ready := True + lockFree.payload := mul.output.lockId + } elsewhen(load.output.valid && io.port.commit.valid) { + port.valid := io.port.commit.write + port.address := load.output.source @@ load.output.rd + port.data.assignFromBits(io.port.commit.value) + + load.output.ready := True + io.port.commit.ready := True + lockFree.payload := load.output.lockId + } + + when(lockFree.fire){ + for(i <- 0 until rfLockCount) when(lockFree.payload === i){ + rf.lock(i).valid := False + } + } + } +} + + + + +object StreamFifoMultiChannelBench extends App{ + val payloadType = HardType(Bits(8 bits)) + class Fpu(name : String, p : FpuParameter) extends Rtl{ + override def getName(): String = "Fpu_" + name + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new FpuCore(p){ + + setDefinitionName(Fpu.this.getName()) + }) + } + + + + val rtls = ArrayBuffer[Fpu]() + rtls += new Fpu( + "32", + FpuParameter( + internalMantissaSize = 23, + withDouble = false, + sourceWidth = 0 + ) + ) + rtls += new Fpu( + "64", + FpuParameter( + internalMantissaSize = 52, + withDouble = true, + sourceWidth = 0 + ) + ) + + val targets = XilinxStdTargets()// ++ AlteraStdTargets() + + + Bench(rtls, targets) +} \ No newline at end of file diff --git a/src/main/scala/vexriscv/ip/fpu/Interface.scala b/src/main/scala/vexriscv/ip/fpu/Interface.scala new file mode 100644 index 0000000..a996390 --- /dev/null +++ b/src/main/scala/vexriscv/ip/fpu/Interface.scala @@ -0,0 +1,77 @@ +package vexriscv.ip.fpu + +import spinal.core._ +import spinal.lib._ + + +object Fpu{ + + object Function{ + val MUL = 0 + val ADD = 1 + } + +} + + + +case class FpuFloat(exponentSize: Int, + mantissaSize: Int) extends Bundle { + val mantissa = UInt(mantissaSize bits) + val exponent = UInt(exponentSize bits) + val sign = Bool +} + +case class FpuOpcode(p : FpuParameter) extends SpinalEnum{ + val LOAD, STORE, MUL, ADD, FMA, I2F, F2I, CMP = newElement() +} + +case class FpuParameter( internalMantissaSize : Int, + withDouble : Boolean, + sourceWidth : Int){ + + val storeLoadType = HardType(Bits(if(withDouble) 64 bits else 32 bits)) + val internalExponentSize = if(withDouble) 11 else 8 + val internalFloating = HardType(FpuFloat(exponentSize = internalExponentSize, mantissaSize = internalMantissaSize)) +// val opcode = HardType(UInt(2 bits)) + val source = HardType(UInt(sourceWidth bits)) + val rfAddress = HardType(UInt(5 bits)) + + val Opcode = new FpuOpcode(this) + val Format = new SpinalEnum{ + val FLOAT = newElement() + val DOUBLE = withDouble generate newElement() + } +} + +case class FpuCmd(p : FpuParameter) extends Bundle{ + val source = UInt(p.sourceWidth bits) + val opcode = p.Opcode() + val value = Bits(32 bits) // Int to float + val function = Bits(3 bits) // Int to float + val rs1, rs2, rs3 = p.rfAddress() + val rd = p.rfAddress() + val format = p.Format() +} + +case class FpuCommit(p : FpuParameter) extends Bundle{ + val source = UInt(p.sourceWidth bits) + val write = Bool() + val value = p.storeLoadType() // IEEE 754 load +} + +case class FpuRsp(p : FpuParameter) extends Bundle{ + val source = UInt(p.sourceWidth bits) + val value = p.storeLoadType() // IEEE754 store || Integer +} + +case class FpuPort(p : FpuParameter) extends Bundle with IMasterSlave { + val cmd = Stream(FpuCmd(p)) + val commit = Stream(FpuCommit(p)) + val rsp = Stream(FpuRsp(p)) + + override def asMaster(): Unit = { + master(cmd, commit) + slave(rsp) + } +} diff --git a/src/test/scala/vexriscv/ip/fpu/FpuTest.scala b/src/test/scala/vexriscv/ip/fpu/FpuTest.scala new file mode 100644 index 0000000..a2cd6d1 --- /dev/null +++ b/src/test/scala/vexriscv/ip/fpu/FpuTest.scala @@ -0,0 +1,245 @@ +package vexriscv.ip.fpu + +import java.lang + +import org.scalatest.FunSuite +import spinal.core.SpinalEnumElement +import spinal.core.sim._ +import spinal.lib.experimental.math.Floating +import spinal.lib.sim._ + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer +import scala.util.Random + +class FpuTest extends FunSuite{ + + + test("directed"){ + val p = FpuParameter( + internalMantissaSize = 23, + withDouble = false, + sourceWidth = 0 + ) + + SimConfig.withFstWave.compile(new FpuCore(p)).doSim(seed = 42){ dut => + dut.clockDomain.forkStimulus(10) + + + + + val cpus = for(id <- 0 until 1 << p.sourceWidth) yield new { + val cmdQueue = mutable.Queue[FpuCmd => Unit]() + val commitQueue = mutable.Queue[FpuCommit => Unit]() + val rspQueue = mutable.Queue[FpuRsp => Unit]() + + def loadRaw(rd : Int, value : BigInt): Unit ={ + cmdQueue += {cmd => + cmd.source #= id + cmd.opcode #= cmd.opcode.spinalEnum.LOAD + cmd.value.randomize() + cmd.rs1.randomize() + cmd.rs2.randomize() + cmd.rs3.randomize() + cmd.rd #= rd + } + commitQueue += {cmd => + cmd.source #= id + cmd.write #= true + cmd.value #= value + } + } + + def load(rd : Int, value : Float): Unit ={ + loadRaw(rd, lang.Float.floatToIntBits(value).toLong & 0xFFFFFFFFl) + } + + def storeRaw(rs : Int)(body : FpuRsp => Unit): Unit ={ + cmdQueue += {cmd => + cmd.source #= id + cmd.opcode #= cmd.opcode.spinalEnum.STORE + cmd.value.randomize() + cmd.rs1.randomize() + cmd.rs2 #= rs + cmd.rs3.randomize() + cmd.rd.randomize() + } + + rspQueue += body + } + + def storeFloat(rs : Int)(body : Float => Unit): Unit ={ + storeRaw(rs){rsp => body(lang.Float.intBitsToFloat(rsp.value.toLong.toInt))} + } + + def mul(rd : Int, rs1 : Int, rs2 : Int): Unit ={ + cmdQueue += {cmd => + cmd.source #= id + cmd.opcode #= cmd.opcode.spinalEnum.MUL + cmd.value.randomize() + cmd.rs1 #= rs1 + cmd.rs2 #= rs2 + cmd.rs3.randomize() + cmd.rd #= rd + } + } + + def add(rd : Int, rs1 : Int, rs2 : Int): Unit ={ + cmdQueue += {cmd => + cmd.source #= id + cmd.opcode #= cmd.opcode.spinalEnum.ADD + cmd.value.randomize() + cmd.rs1 #= rs1 + cmd.rs2 #= rs2 + cmd.rs3.randomize() + cmd.rd #= rd + } + } + + def fma(rd : Int, rs1 : Int, rs2 : Int, rs3 : Int): Unit ={ + cmdQueue += {cmd => + cmd.source #= id + cmd.opcode #= cmd.opcode.spinalEnum.FMA + cmd.value.randomize() + cmd.rs1 #= rs1 + cmd.rs2 #= rs2 + cmd.rs3 #= rs3 + cmd.rd #= rd + } + } + } + + StreamDriver(dut.io.port.cmd ,dut.clockDomain){payload => + cpus.map(_.cmdQueue).filter(_.nonEmpty).toSeq match { + case Nil => false + case l => { + l.randomPick().dequeue().apply(payload) + true + } + } + } + + StreamDriver(dut.io.port.commit ,dut.clockDomain){payload => + cpus.map(_.commitQueue).filter(_.nonEmpty).toSeq match { + case Nil => false + case l => { + l.randomPick().dequeue().apply(payload) + true + } + } + } + + + StreamMonitor(dut.io.port.rsp, dut.clockDomain){payload => + cpus(payload.source.toInt).rspQueue.dequeue().apply(payload) + } + + StreamReadyRandomizer(dut.io.port.rsp, dut.clockDomain) + + + + + + val stim = for(cpu <- cpus) yield fork { + import cpu._ + + class RegAllocator(){ + var value = 0 + + def allocate(): Int ={ + while(true){ + val rand = Random.nextInt(32) + val mask = 1 << rand + if((value & mask) == 0) { + value |= mask + return rand + } + } + 0 + } + } + def checkFloat(ref : Float, dut : Float): Boolean ={ + ref.abs * 1.0001 > dut.abs && ref.abs * 0.9999 < dut.abs && ref.signum == dut.signum + } + + def randomFloat(): Float ={ + Random.nextFloat() * 1e2f * (if(Random.nextBoolean()) -1f else 1f) + } + + def testAdd(a : Float, b : Float): Unit ={ + val rs = new RegAllocator() + val rs1, rs2, rs3 = rs.allocate() + val rd = Random.nextInt(32) + load(rs1, a) + load(rs2, b) + + add(rd,rs1,rs2) + storeFloat(rd){v => + val ref = a+b + println(f"$a + $b = $v, $ref") + assert(checkFloat(ref, v)) + } + } + + def testMul(a : Float, b : Float): Unit ={ + val rs = new RegAllocator() + val rs1, rs2, rs3 = rs.allocate() + val rd = Random.nextInt(32) + load(rs1, a) + load(rs2, b) + + mul(rd,rs1,rs2) + storeFloat(rd){v => + val ref = a*b + println(f"$a * $b = $v, $ref") + assert(checkFloat(ref, v)) + } + } + + + def testFma(a : Float, b : Float, c : Float): Unit ={ + val rs = new RegAllocator() + val rs1, rs2, rs3 = rs.allocate() + val rd = Random.nextInt(32) + load(rs1, a) + load(rs2, b) + load(rs3, c) + + fma(rd,rs1,rs2,rs3) + storeFloat(rd){v => + val ref = a * b + c + println(f"$a * $b + $c = $v, $ref") + assert(checkFloat(ref, v)) + } + } + +// testAdd(0.1f, 1.6f) +// testMul(0.1f, 1.6f) + testFma(1.1f, 2.2f, 3.0f) + + for(i <- 0 until 1000){ + testAdd(randomFloat(), randomFloat()) + } + for(i <- 0 until 1000){ + testMul(randomFloat(), randomFloat()) + } + for(i <- 0 until 1000){ + testFma(randomFloat(), randomFloat(), randomFloat()) + } + for(i <- 0 until 1000){ + val tests = ArrayBuffer[() => Unit]() + tests += (() =>{testAdd(randomFloat(), randomFloat())}) + tests += (() =>{testMul(randomFloat(), randomFloat())}) + tests += (() =>{testFma(randomFloat(), randomFloat(), randomFloat())}) + tests.randomPick().apply() + } + + waitUntil(cpu.rspQueue.isEmpty) + } + + + stim.foreach(_.join()) + dut.clockDomain.waitSampling(100) + } + } +} diff --git a/src/test/scala/vexriscv/ip/fpu/Playground.scala b/src/test/scala/vexriscv/ip/fpu/Playground.scala new file mode 100644 index 0000000..2fb99ad --- /dev/null +++ b/src/test/scala/vexriscv/ip/fpu/Playground.scala @@ -0,0 +1,39 @@ +package vexriscv.ip.fpu + +object MiaouDiv extends App{ + val input = 2.5 + var output = 1/(input*0.95) +// def x = output +// def y = input + + def y = output + def x = input + + for(i <- 0 until 10) { + output = 2 * y - x * y * y + println(output) + } + + + //output = x*output + println(1/input) +} + +object MiaouSqrt extends App{ + val input = 2.0 + var output = 1/Math.sqrt(input*0.95) + // def x = output + // def y = input + + def y = output + def x = input + + for(i <- 0 until 10) { + output = y*(1.5-x*y*y/2) + println(output) + } + + output = x*output + println(output) + println(s"ref ${Math.sqrt(input)}") +}