From de09ed3fcb3d221bce97afec5267586efb15f011 Mon Sep 17 00:00:00 2001 From: Dolu1990 Date: Thu, 25 Feb 2021 15:28:38 +0100 Subject: [PATCH] fpu added exact div/sqrt implementations using iterative approaches --- src/main/scala/vexriscv/ip/fpu/FpuCore.scala | 260 +++++++++++++++++- src/main/scala/vexriscv/ip/fpu/FpuDiv.scala | 128 +++++++++ src/main/scala/vexriscv/ip/fpu/FpuSqrt.scala | 116 ++++++++ .../scala/vexriscv/ip/fpu/Interface.scala | 7 +- src/test/scala/vexriscv/ip/fpu/FpuTest.scala | 100 +++---- 5 files changed, 537 insertions(+), 74 deletions(-) create mode 100644 src/main/scala/vexriscv/ip/fpu/FpuDiv.scala create mode 100644 src/main/scala/vexriscv/ip/fpu/FpuSqrt.scala diff --git a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala index 21b2ad7..daf7877 100644 --- a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala +++ b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala @@ -3,6 +3,7 @@ package vexriscv.ip.fpu import spinal.core._ import spinal.lib._ import spinal.lib.eda.bench.{Bench, Rtl, XilinxStdTargets} +import spinal.lib.math.UnsignedDivider import scala.collection.mutable.ArrayBuffer @@ -24,8 +25,8 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ val exponentF32Infinity = exponentOne+127+1 val exponentF64Infinity = exponentOne+1023+1 - val rfLockCount = 5 - val lockIdType = HardType(UInt(log2Up(rfLockCount) bits)) + + val lockIdType = HardType(UInt(log2Up(p.rfLockCount) bits)) def whenDouble(format : FpuFormat.C)(yes : => Unit)(no : => Unit): Unit ={ if(p.withDouble) when(format === FpuFormat.DOUBLE) { yes } otherwise{ no } @@ -106,6 +107,25 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ val format = p.withDouble generate FpuFormat() } + case class DivInput() extends Bundle{ /* Payload of the decode-to-div stream: both operands, destination register, register-file lock id, rounding mode and (with double support) the format. */ + val source = Source() + val rs1, rs2 = p.internalFloating() + val rd = p.rfAddress() + val lockId = lockIdType() + val roundMode = FpuRoundMode() + val format = p.withDouble generate
FpuFormat() + } + + + case class SqrtInput() extends Bundle{ /* Payload of the decode-to-sqrt stream: same fields as DivInput, but with the single operand rs1. */ + val source = Source() + val rs1 = p.internalFloating() + val rd = p.rfAddress() + val lockId = lockIdType() + val roundMode = FpuRoundMode() + val format = p.withDouble generate FpuFormat() + } + case class AddInput() extends Bundle{ val source = Source() @@ -145,11 +165,11 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ val boxed = p.withDouble generate Bool() } val ram = Mem(Entry(), 32*portCount) - val lock = for(i <- 0 until rfLockCount) yield new Area{ + val lock = for(i <- 0 until p.rfLockCount) yield new Area{ val valid = RegInit(False) val source = Reg(Source()) val address = Reg(p.rfAddress) - val id = Reg(UInt(log2Up(rfLockCount) bits)) + val id = Reg(UInt(log2Up(p.rfLockCount+1) bits)) val commited = Reg(Bool) val write = Reg(Bool) } @@ -184,7 +204,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ val commitLogic = for(source <- 0 until portCount) yield new Area{ val fire = False - val target, hit = Reg(UInt(log2Up(rfLockCount+1) bits)) init(0) + val target, hit = Reg(UInt(log2Up(p.rfLockCount+1) bits)) init(0) val full = target + 1 === hit when(fire){ hit := hit + 1 @@ -241,7 +261,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ commitLogic(i).target := commitLogic(i).target + 1 } } - for(i <- 0 until rfLockCount){ + for(i <- 0 until p.rfLockCount){ when(rf.lockFreeId(i)){ rf.lock(i).valid := True rf.lock(i).source := s0.source @@ -317,10 +337,31 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ divSqrt.div := input.opcode === p.Opcode.DIV } + val divHit = input.opcode === p.Opcode.DIV + val div = Stream(DivInput()) /* DIV requests leave decode on this stream; the input is acknowledged when the stream is ready. */ + if(p.withDiv) { + input.ready setWhen (divHit && div.ready) + div.valid := input.valid && divHit + div.payload.assignSomeByName(input.payload) + } + + val sqrtHit = input.opcode === p.Opcode.SQRT + val sqrt = Stream(SqrtInput()) /* Same routing for SQRT requests. */ + if(p.withSqrt) { + input.ready
setWhen (sqrtHit && sqrt.ready) + sqrt.valid := input.valid && sqrtHit + sqrt.payload.assignSomeByName(input.payload) + } + + val fmaHit = input.opcode === p.Opcode.FMA val mulHit = input.opcode === p.Opcode.MUL || fmaHit val mul = Stream(new MulInput()) val divSqrtToMul = Stream(new MulInput()) + if(!p.withDivSqrt){ /* The divSqrt unit is disabled: tie its stream toward the multiplier off (never valid, payload don't-care). */ + divSqrtToMul.valid := False + divSqrtToMul.payload.assignDontCare() + } if(p.withMul) { input.ready setWhen (mulHit && mul.ready && !divSqrtToMul.valid) @@ -910,7 +951,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ // val flag = io.port(input.source).completion.flag when(forceNan) { output.setNanQuiet - NV setWhen(input.valid && (infinitynan || input.rs1.isNanSignaling || input.rs2.isNanSignaling)) + NV setWhen(infinitynan || input.rs1.isNanSignaling || input.rs2.isNanSignaling) } elsewhen(forceOverflow) { output.setInfinity } elsewhen(forceZero) { @@ -958,6 +999,145 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ } } + + val div = p.withDiv generate new Area{ /* Division stage: feeds both operand mantissas to an FpuDiv instance and holds the output stream (haltIt) until the divider result is valid. */ + val input = decode.div.halfPipe() + val haltIt = True + val output = input.haltWhen(haltIt).swapPayload(new MergeInput()) + + val dividerShift = if(p.withDouble) 0 else 1 /* One padding bit in single precision, presumably to satisfy the even-width assertion in FpuDiv (23 + 1 = 24) - confirm. */ + val divider = FpuDiv(p.internalMantissaSize + dividerShift) + divider.io.input.a := input.rs1.mantissa << dividerShift + divider.io.input.b := input.rs2.mantissa << dividerShift + val dividerResult = divider.io.output.result >> dividerShift + val dividerScrap = divider.io.output.remain =/= 0 || divider.io.output.result(0, dividerShift bits) =/= 0 + + val cmdSent = RegInit(False) setWhen(divider.io.input.fire) clearWhen(!haltIt) /* Remembers that the command was already issued, so the divider is started only once per input. */ + divider.io.input.valid := input.valid && !cmdSent + divider.io.output.ready := input.ready + output.payload.assignSomeByName(input.payload) + + val needShift = !dividerResult.msb /* Quotient MSB clear: the lower result window is used and the exponent is decremented by one. */ + val mantissa = needShift ?
dividerResult(0, p.internalMantissaSize + 1 bits) | dividerResult(1, p.internalMantissaSize + 1 bits) + val scrap = dividerScrap || !needShift && dividerResult(0) /* Set when the remainder is nonzero or when result bits dropped by the selections above are nonzero. */ + val exponentOffset = 1 << (p.internalExponentSize + (if(p.withDouble) 0 else 1)) + val exponent = input.rs1.exponent + U(exponentOffset | exponentOne) - input.rs2.exponent - U(needShift) + + output.value.setNormal + output.value.sign := input.rs1.sign ^ input.rs2.sign + output.value.exponent := exponent.resized + output.value.mantissa := mantissa + output.scrap := scrap + if(!p.withDouble) when(exponent.takeHigh(2) === 3){ output.value.exponent(p.internalExponentSize-3, 3 bits) := 7} //Handle overflow + + + + val underflowThreshold = muxDouble[UInt](input.format)(exponentOne + exponentOffset - 1023 - 53) (exponentOne + exponentOffset - 127 - 24) + val underflowExp = muxDouble[UInt](input.format)(exponentOne + exponentOffset - 1023 - 54) (exponentOne + exponentOffset - 127 - 25) + val forceUnderflow = exponent < underflowThreshold + val forceOverflow = input.rs1.isInfinity || input.rs2.isZero + val infinitynan = input.rs1.isZero && input.rs2.isZero + val forceNan = input.rs1.isNan || input.rs2.isNan || infinitynan + val forceZero = input.rs1.isZero + + + + output.NV := False + output.DZ := !forceNan && input.rs2.isZero + + when(exponent(exponent.getWidth-3, 3 bits) === 7) { output.value.exponent(p.internalExponentSize-2, 2 bits) := 3 } + + when(forceNan) { /* Special cases in priority order: NaN, then infinity, then zero, then underflow. */ + output.value.setNanQuiet + output.NV setWhen((infinitynan || input.rs1.isNanSignaling || input.rs2.isNanSignaling)) + } elsewhen(forceOverflow) { + output.value.setInfinity + } elsewhen(forceZero) { + output.value.setZero + } elsewhen(forceUnderflow) { + output.value.exponent := underflowExp.resized + } + + + haltIt clearWhen(divider.io.output.valid) /* Release the output stream once the divider has produced its result. */ + } + + + + val sqrt = p.withSqrt generate new Area{ /* Square-root stage: same halt-until-valid handshake as the div stage, built around an FpuSqrt instance. */ + val input = decode.sqrt.halfPipe() + val haltIt = True + val output = input.haltWhen(haltIt).swapPayload(new MergeInput()) + + val needShift =
!input.rs1.exponent.lsb + val sqrt = FpuSqrt(p.internalMantissaSize) + sqrt.io.input.a := (needShift ? (U"1" @@ input.rs1.mantissa @@ U"0") | (U"01" @@ input.rs1.mantissa)) /* The operand is placed one position higher when the exponent LSB is clear; the constant top bit is presumably the implicit leading one - confirm. */ + + val cmdSent = RegInit(False) setWhen(sqrt.io.input.fire) clearWhen(!haltIt) + sqrt.io.input.valid := input.valid && !cmdSent + sqrt.io.output.ready := input.ready + output.payload.assignSomeByName(input.payload) + + + val scrap = sqrt.io.output.remain =/= 0 + val exponent = RegNext(exponentOne-exponentOne/2 -1 +^ (input.rs1.exponent >> 1) + U(input.rs1.exponent.lsb)) /* Registered result exponent: a constant offset plus half the input exponent plus its LSB. */ + + output.value.setNormal + output.value.sign := input.rs1.sign + output.value.exponent := exponent + output.value.mantissa := sqrt.io.output.result + output.scrap := scrap + output.NV := False + output.DZ := False + + val negative = !input.rs1.isNan && !input.rs1.isZero && input.rs1.sign /* Negative, non-zero, non-NaN operand: the result is forced to a quiet NaN with NV raised. The when-blocks that follow are independent, so a later one overrides an earlier one. */ + + when(input.rs1.isInfinity){ + output.value.setInfinity + } + when(negative){ + output.value.setNanQuiet + output.NV := True + } + when(input.rs1.isNan){ + output.value.setNanQuiet + output.NV := !input.rs1.isQuiet + } + when(input.rs1.isZero){ + output.value.setZero + } + + +// val underflowThreshold = muxDouble[UInt](input.format)(exponentOne + exponentOffset - 1023 - 53) (exponentOne + exponentOffset - 127 - 24) +// val underflowExp = muxDouble[UInt](input.format)(exponentOne + exponentOffset - 1023 - 54) (exponentOne + exponentOffset - 127 - 25) +// val forceUnderflow = exponent < underflowThreshold +// val forceOverflow = input.rs1.isInfinity// || input.rs2.isInfinity +// val infinitynan = input.rs1.isZero && input.rs2.isZero +// val forceNan = input.rs1.isNan || input.rs2.isNan || infinitynan +// val forceZero = input.rs1.isZero +// +// +// +// output.NV := False +// output.DZ := !forceNan && input.rs2.isZero +// +// when(exponent(exponent.getWidth-3, 3 bits) === 7) { output.value.exponent(p.internalExponentSize-2, 2 bits) := 3 } +// +// when(forceNan) { +// output.value.setNanQuiet +// output.NV setWhen((infinitynan ||
input.rs1.isNanSignaling || input.rs2.isNanSignaling)) +// } elsewhen(forceOverflow) { +// output.value.setInfinity +// } elsewhen(forceZero) { +// output.value.setZero +// } elsewhen(forceUnderflow) { +// output.value.exponent := underflowExp.resized +// } + + + haltIt clearWhen(sqrt.io.output.valid) /* Release the output stream once the iterative unit has produced its result. */ + } + val divSqrt = p.withDivSqrt generate new Area { val input = decode.divSqrt.halfPipe() @@ -1263,7 +1443,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ // val flag = io.port(input.source).completion.flag - output.NV := (input.valid && (infinityNan || input.rs1.isNanSignaling || input.rs2.isNanSignaling)) + output.NV := infinityNan || input.rs1.isNanSignaling || input.rs2.isNanSignaling output.DZ := False when(forceNan) { output.value.setNanQuiet @@ -1286,6 +1466,8 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ //TODO maybe load can bypass merge and round. val inputs = ArrayBuffer[Stream[MergeInput]]() inputs += load.s1.output.stage() + if(p.withSqrt) (inputs += sqrt.output) + if(p.withDiv) (inputs += div.output) /* The new sqrt and div result streams are added to the list of merge inputs. */ if(p.withAdd) (inputs += add.result.output) if(p.withMul) (inputs += mul.result.output) if(p.withShortPipMisc) (inputs += shortPip.rfOutput.pipelined(m2s = true)) @@ -1422,7 +1604,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ } when(input.valid){ - for(i <- 0 until rfLockCount) when(input.lockId === i){ + for(i <- 0 until p.rfLockCount) when(input.lockId === i){ rf.lock(i).valid := False } } @@ -1516,19 +1698,40 @@ object FpuSynthesisBench extends App{ SpinalVerilog(new Component{ val a = Delay(in UInt(width bits), 3) val sel = Delay(in UInt(log2Up(width) bits),3) -// val result = -// val output = Delay(result, 3) + // val result = + // val output = Delay(result, 3) setDefinitionName(Rotate3.this.getName()) }) } + class Div(width : Int) extends Rtl{ /* Bench RTL: the library UnsignedDivider instantiated at the given width. */ + override def getName(): String = "div_" + width + override def getRtlPath(): String = getName() + ".v" +
SpinalVerilog(new UnsignedDivider(width,width, false).setDefinitionName(Div.this.getName())) + } + class Add(width : Int) extends Rtl{ /* Bench RTL: an unsigned adder of the given width with no registers (result = a + b). */ + override def getName(): String = "add_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new Component{ + val a, b = in UInt(width bits) + val result = out(a + b) + setDefinitionName(Add.this.getName()) + }) + } + + class DivSqrtRtl(width : Int) extends Rtl{ /* Bench RTL wrapping FpuDiv; NOTE(review): despite the name, only the divider is instantiated here. */ + override def getName(): String = "DivSqrt_" + width + override def getRtlPath(): String = getName() + ".v" + SpinalVerilog(new FpuDiv(width).setDefinitionName(DivSqrtRtl.this.getName())) + } val rtls = ArrayBuffer[Rtl]() rtls += new Fpu( "32", portCount = 1, FpuParameter( +// withDivSqrt = false, withDouble = false ) ) @@ -1536,11 +1739,18 @@ object FpuSynthesisBench extends App{ "64", portCount = 1, FpuParameter( +// withDivSqrt = false, withDouble = true ) ) -// rtls += new Shifter(24) +// rtls += new Div(52) +// rtls += new Div(23) +// rtls += new Add(64) +// rtls += new DivSqrtRtl(52) +// rtls += new DivSqrtRtl(23) + + // rtls += new Shifter(24) // rtls += new Shifter(32) // rtls += new Shifter(52) // rtls += new Shifter(64) @@ -1557,4 +1767,28 @@ object FpuSynthesisBench extends App{ Bench(rtls, targets) -} \ No newline at end of file +} + +//Fpu_32 -> +//Artix 7 -> 136 Mhz 1471 LUT 1336 FF +//Artix 7 -> 196 Mhz 1687 LUT 1371 FF +//Fpu_64 -> +//Artix 7 -> 105 Mhz 2822 LUT 2132 FF +//Artix 7 -> 161 Mhz 3114 LUT 2272 FF +// +// +// +//Fpu_32 -> +//Artix 7 -> 128 Mhz 1693 LUT 1481 FF +//Artix 7 -> 203 Mhz 1895 LUT 1481 FF +//Fpu_64 -> +//Artix 7 -> 99 Mhz 3073 LUT 2396 FF +//Artix 7 -> 164 Mhz 3433 LUT 2432 FF + + +//Fpu_32 -> +//Artix 7 -> 112 Mhz 1790 LUT 1666 FF +//Artix 7 -> 158 Mhz 1989 LUT 1701 FF +//Fpu_64 -> +//Artix 7 -> 100 Mhz 3294 LUT 2763 FF +//Artix 7 -> 151 Mhz 3708 LUT 2904 FF \ No newline at end of file diff --git a/src/main/scala/vexriscv/ip/fpu/FpuDiv.scala b/src/main/scala/vexriscv/ip/fpu/FpuDiv.scala new file mode 100644
index 0000000..9912e3f --- /dev/null +++ b/src/main/scala/vexriscv/ip/fpu/FpuDiv.scala @@ -0,0 +1,128 @@ +package vexriscv.ip.fpu + + +import spinal.core._ +import spinal.lib.math.{UnsignedDividerCmd, UnsignedDividerRsp} +import spinal.lib._ +import spinal.lib.sim.{StreamDriver, StreamMonitor, StreamReadyRandomizer} + +import scala.collection.mutable +import scala.util.Random + +case class FpuDivCmd(mantissaWidth : Int) extends Bundle{ /* Divider command: dividend a and divisor b, each mantissaWidth bits; FpuDiv prepends a constant 1 bit to both. */ + val a,b = UInt(mantissaWidth bits) +} + +case class FpuDivRsp(mantissaWidth : Int) extends Bundle{ /* Divider response: quotient (mantissaWidth+3 bits) and final remainder (mantissaWidth+1 bits). */ + val result = UInt(mantissaWidth+1 + 2 bits) + val remain = UInt(mantissaWidth+1 bits) +} + +case class FpuDiv(val mantissaWidth : Int) extends Component { /* Iterative divider producing two quotient bits per iteration: each step compares the running remainder against 1x, 2x and 3x the divisor. mantissaWidth must be even. */ + assert(mantissaWidth % 2 == 0) + val io = new Bundle{ + val input = slave Stream(FpuDivCmd(mantissaWidth)) + val output = master Stream(FpuDivRsp(mantissaWidth)) + } + + val iterations = (mantissaWidth+2+2)/2 /* Number of two-bit steps per division. */ + val counter = Reg(UInt(log2Up(iterations) bits)) + val busy = RegInit(False) clearWhen(io.output.fire) /* busy: a command was accepted and its response not yet consumed; done (below): the result is being presented on io.output. */ + val done = RegInit(False) setWhen(busy && counter === iterations-1) clearWhen(io.output.fire) + + val shifter = Reg(UInt(mantissaWidth + 3 bits)) + val result = Reg(UInt(mantissaWidth+1+2 bits)) + + val div1, div3 = Reg(UInt(mantissaWidth+3 bits)) /* div1 holds 1x the divisor and div3 holds 3x; 2x is derived below by shifting div1. */ + val div2 = div1 |<< 1 + + val sub1 = shifter -^ div1 + val sub2 = shifter -^ div2 + val sub3 = shifter -^ div3 + + io.output.valid := done + io.output.result := (result << 0).resized + io.output.remain := (shifter >> 2).resized + io.input.ready := !busy + + when(!done){ /* One step: among shifter minus 1x, 2x and 3x the divisor, the last non-negative candidate wins (later when-blocks override earlier ones); the matching digit enters the low two result bits and the remainder is shifted left by two. */ + counter := counter + 1 + val sel = CombInit(shifter) + result := result |<< 2 + when(!sub1.msb){ + sel := sub1.resized + result(1 downto 0) := 1 + } + when(!sub2.msb){ + sel := sub2.resized + result(1 downto 0) := 2 + } + when(!sub3.msb){ + sel := sub3.resized + result(1 downto 0) := 3 + } + shifter := sel |<< 2 + } + + when(!busy){ /* Idle: operands are loaded from io.input every cycle, and busy is raised when io.input.valid is seen. Being last, these assignments override the step logic above. */ + counter := 0 + shifter := (U"1" @@ io.input.a @@ U"").resized + div1 := (U"1" @@ io.input.b).resized + div3 := (U"1" @@
io.input.b) +^ (((U"1" @@ io.input.b)) << 1) + busy := io.input.valid + } +} + + +object FpuDivTester extends App{ /* Simulation testbench: drives directed and random operand pairs into FpuDiv and checks quotient and remainder against a Long reference computation. */ + import spinal.core.sim._ + + for(w <- List(16, 20)) { + val config = SimConfig + config.withFstWave + config.compile(new FpuDiv(w)).doSim(seed=2){dut => + dut.clockDomain.forkStimulus(10) + + + val (cmdDriver, cmdQueue) = StreamDriver.queue(dut.io.input, dut.clockDomain) + val rspQueue = mutable.Queue[FpuDivRsp => Unit]() + StreamMonitor(dut.io.output, dut.clockDomain)( rspQueue.dequeue()(_)) + StreamReadyRandomizer(dut.io.output, dut.clockDomain) + + def test(a : Int, b : Int): Unit ={ + cmdQueue +={p => + p.a #= a + p.b #= b + } + rspQueue += {p => /* Reference model: set the bit above the mantissa on both operands, then divide the dividend scaled by 2^(mantissaWidth+2). */ + val x = (a | (1 << dut.mantissaWidth)).toLong + val y = (b | (1 << dut.mantissaWidth)).toLong + val result = (x << dut.mantissaWidth+2) / y + val remain = (x << dut.mantissaWidth+2) % y + + assert(p.result.toLong == result, f"$x%x/$y%x=${p.result.toLong}%x instead of $result%x") + assert(p.remain.toLong == remain, f"$x%x %% $y%x=${p.remain.toLong}%x instead of $remain%x") + } + } + + val s = dut.mantissaWidth-16 + val f = (1 << dut.mantissaWidth)-1 + test(0xE000 << s, 0x8000 << s) + test(0xC000 << s, 0x4000 << s) + test(0xC835 << s, 0x4742 << s) + test(0,0) + test(0,f) + test(f,0) + test(f,f) + + for(i <- 0 until 10000){ + test(Random.nextInt(1 << dut.mantissaWidth), Random.nextInt(1 << dut.mantissaWidth)) + } + + waitUntil(rspQueue.isEmpty) + + dut.clockDomain.waitSampling(100) + + } + } +} \ No newline at end of file diff --git a/src/main/scala/vexriscv/ip/fpu/FpuSqrt.scala b/src/main/scala/vexriscv/ip/fpu/FpuSqrt.scala new file mode 100644 index 0000000..0f80905 --- /dev/null +++ b/src/main/scala/vexriscv/ip/fpu/FpuSqrt.scala @@ -0,0 +1,116 @@ +package vexriscv.ip.fpu + +import spinal.core._ +import spinal.lib._ +import spinal.lib.sim.{StreamDriver, StreamMonitor, StreamReadyRandomizer} + +import scala.collection.mutable +import scala.util.Random + +case class FpuSqrtCmd(mantissaWidth : Int) extends
Bundle{ + val a = UInt(mantissaWidth+2 bits) +} + +case class FpuSqrtRsp(mantissaWidth : Int) extends Bundle{ /* Square-root response: the root bits (mantissaWidth+1 bits) and the remainder left after the last step. */ + val result = UInt(mantissaWidth+1 bits) + val remain = UInt(mantissaWidth+5 bits) +} + +case class FpuSqrt(val mantissaWidth : Int) extends Component { /* Iterative square root producing one result bit per iteration: each step tries to subtract (q @@ "01") from the partial remainder. */ + val io = new Bundle{ + val input = slave Stream(FpuSqrtCmd(mantissaWidth)) + val output = master Stream(FpuSqrtRsp(mantissaWidth)) + } + + val iterations = mantissaWidth+2 + val counter = Reg(UInt(log2Up(iterations ) bits)) + val busy = RegInit(False) clearWhen(io.output.fire) + val done = RegInit(False) setWhen(busy && counter === iterations-1) clearWhen(io.output.fire) + + val a = Reg(UInt(mantissaWidth+5 bits)) + val x = Reg(UInt(mantissaWidth bits)) + val q = Reg(UInt(mantissaWidth+1 bits)) /* a: partial remainder, x: operand bits not yet consumed, q: root bits found so far. */ + val t = a-(q @@ U"01") + + + io.output.valid := done + io.output.result := (q << 0).resized + io.output.remain := a + io.input.ready := !busy + + when(!done){ /* One step: when the trial subtraction t is non-negative it replaces the remainder and a 1 is appended to q, otherwise a 0 is appended; then the next two operand bits are shifted into a. */ + counter := counter + 1 + val sel = CombInit(a) + when(!t.msb){ + sel := t.resized + } + q := (q @@ !t.msb).resized + a := (sel @@ x(widthOf(x)-2,2 bits)).resized + x := x |<< 2 + } + + when(!busy){ /* Idle: state is reloaded from io.input every cycle (overriding the step logic above), and busy is raised when io.input.valid is seen. */ + q := 0 + a := io.input.a(widthOf(io.input.a)-2,2 bits).resized + x := (io.input.a).resized + counter := 0 + when(io.input.valid){ + busy := True + } + } +} + + +object FpuSqrtTester extends App{ /* Simulation testbench: checks io.output.result against a Math.sqrt based reference for directed and random inputs; the remainder check is commented out. */ + import spinal.core.sim._ + + for(w <- List(16)) { + val config = SimConfig + config.withFstWave + config.compile(new FpuSqrt(w)).doSim(seed=2){dut => + dut.clockDomain.forkStimulus(10) + + + val (cmdDriver, cmdQueue) = StreamDriver.queue(dut.io.input, dut.clockDomain) + val rspQueue = mutable.Queue[FpuSqrtRsp => Unit]() + StreamMonitor(dut.io.output, dut.clockDomain)( rspQueue.dequeue()(_)) + StreamReadyRandomizer(dut.io.output, dut.clockDomain) + + def test(a : Int): Unit ={ + cmdQueue +={p => + p.a #= a + } + rspQueue += {p => +// val x = (a * (1l << dut.mantissaWidth)).toLong +// val result = Math.sqrt(x).toLong/(1 << dut.mantissaWidth/2) +// val remain = a-x*x
+ val x = a.toDouble / (1 << dut.mantissaWidth) + val result = (Math.sqrt(x)*(1 << dut.mantissaWidth+1)).toLong + val filtred = result % (1 << dut.mantissaWidth+1) /* Keep only the low mantissaWidth+1 bits, matching the width of the DUT result port. */ +// val remain = (a-(result*result)).toLong + assert(p.result.toLong == filtred, f"$a%x=${p.result.toLong}%x instead of $filtred%x") +// assert(p.remain.toLong == remain, f"$a%x=${p.remain.toLong}%x instead of $remain%x") + } + } + + val s = dut.mantissaWidth-16 + val f = (1 << dut.mantissaWidth)-1 +// test(121) + test(0x20000) + test(0x18000) +// test(0,0) +// test(0,f) +// test(f,0) +// test(f,f) + + for(i <- 0 until 10000){ + test(Random.nextInt(3 << dut.mantissaWidth) + (1 << dut.mantissaWidth)) + } + + waitUntil(rspQueue.isEmpty) + + dut.clockDomain.waitSampling(100) + + } + } +} \ No newline at end of file diff --git a/src/main/scala/vexriscv/ip/fpu/Interface.scala b/src/main/scala/vexriscv/ip/fpu/Interface.scala index dd0d2f0..e5a0272 100644 --- a/src/main/scala/vexriscv/ip/fpu/Interface.scala +++ b/src/main/scala/vexriscv/ip/fpu/Interface.scala @@ -50,7 +50,7 @@ case class FpuFloat(exponentSize: Int, def isInfinity = special && exponent(1 downto 0) === FpuFloat.INFINITY def isNan = special && exponent(1 downto 0) === FpuFloat.NAN def isQuiet = mantissa.msb - def isNanSignaling = special && exponent(1 downto 0) === FpuFloat.NAN && !isQuiet + def isNanSignaling = special && exponent(1 downto 0) === FpuFloat.NAN && !isQuiet def isCanonical = exponent(FpuFloat.NAN_CANONICAL_BIT) def setNormal = { special := False } @@ -118,10 +118,13 @@ object FpuRoundModeInstr extends SpinalEnum(){ case class FpuParameter( withDouble : Boolean, mulWidthA : Int = 18, mulWidthB : Int = 18, + rfLockCount : Int = 8, sim : Boolean = false, withAdd : Boolean = true, withMul : Boolean = true, - withDivSqrt : Boolean = true, + withDivSqrt : Boolean = false, + withDiv : Boolean = true, + withSqrt : Boolean = true, withShortPipMisc : Boolean = true){ val internalMantissaSize = if(withDouble) 52 else 23 diff --git
a/src/test/scala/vexriscv/ip/fpu/FpuTest.scala b/src/test/scala/vexriscv/ip/fpu/FpuTest.scala index dafac84..5b1a801 100644 --- a/src/test/scala/vexriscv/ip/fpu/FpuTest.scala +++ b/src/test/scala/vexriscv/ip/fpu/FpuTest.scala @@ -55,11 +55,11 @@ class FpuTest extends FunSuite{ } def testP(p : FpuParameter){ - val portCount = 4 + val portCount = 1 val config = SimConfig config.allOptimisation -// if(p.withDouble) config.withFstWave +// config.withFstWave config.compile(new FpuCore(portCount, p){ for(i <- 0 until portCount) out(Bits(5 bits)).setName(s"flagAcc$i") := io.port(i).completion.flags.asBits setDefinitionName("FpuCore"+ (if(p.withDouble) "Double" else "")) @@ -724,58 +724,34 @@ class FpuTest extends FunSuite{ } } - def testSqrtExact(a : Float, ref : Float, flag : Int, rounding : FpuRoundMode.E): Unit ={ - val rs = new RegAllocator() - val rs1, rs2, rs3 = rs.allocate() - val rd = Random.nextInt(32) - load(rs1, a) - - sqrt(rd,rs1, FpuRoundMode.RNE, FpuFormat.FLOAT) - storeFloat(rd){v => - val error = Math.abs(ref-v)/ref - assert(checkFloat(ref, v), f"sqrt($a) = $v, $ref $error $rounding") - } - } - - def testDivExact(a : Float, b : Float, ref : Float, flag : Int, rounding : FpuRoundMode.E): Unit ={ - val rs = new RegAllocator() - val rs1, rs2, rs3 = rs.allocate() - val rd = Random.nextInt(32) - load(rs1, a) - load(rs2, b) - - div(rd,rs1, rs2, FpuRoundMode.RNE, FpuFormat.FLOAT) - storeFloat(rd){v => - val error = Math.abs(ref-v)/ref - assert(checkFloat(ref, v), f"div($a, $b) = $v, $ref $error $rounding") - } - } - def testSqrtF64Exact(a : Double, ref : Double, flag : Int, rounding : FpuRoundMode.E): Unit ={ val rs = new RegAllocator() val rs1, rs2, rs3 = rs.allocate() val rd = Random.nextInt(32) load(rs1, a) - sqrt(rd,rs1, FpuRoundMode.RNE, FpuFormat.DOUBLE) + sqrt(rd,rs1, rounding, FpuFormat.DOUBLE) + store(rd){v => - val error = Math.abs(ref-v)/ref - assert(checkDouble(ref, v), f"sqrt($a) = $v, $ref $error $rounding") + assert(d2b(v) == d2b(ref), f"## 
sqrt${a} = $v, $ref $rounding, ${d2b(a).toString(16)} ${d2b(ref).toString(16)}") } + + flagMatch(flag, ref, f"## sqrt${a} $ref $rounding") } - def testDivF64Exact(a : Double, b : Double, ref : Double, flag : Int, rounding : FpuRoundMode.E): Unit ={ + def testSqrtExact(a : Float, ref : Float, flag : Int, rounding : FpuRoundMode.E): Unit ={ val rs = new RegAllocator() val rs1, rs2, rs3 = rs.allocate() val rd = Random.nextInt(32) load(rs1, a) - load(rs2, b) - div(rd,rs1, rs2, FpuRoundMode.RNE, FpuFormat.DOUBLE) - store(rd){v => - val error = Math.abs(ref-v)/ref - assert(checkDouble(ref, v), f"div($a, $b) = $v, $ref $error $rounding") + sqrt(rd,rs1, rounding, FpuFormat.FLOAT) + + storeFloat(rd){v => + assert(d2b(v) == d2b(ref), f"## sqrt${a} = $v, $ref $rounding, ${f2b(a).toString()} ${f2b(ref).toString()}") } + + flagMatch(flag, ref, f"## sqrt${a} $ref $rounding") } @@ -1108,8 +1084,7 @@ class FpuTest extends FunSuite{ def testDiv() : Unit = { val rounding = FpuRoundMode.elements.randomPick() val (a,b,r,f) = f32.div(rounding).f32_f32_f32 - testDivExact(a, b, r, f, rounding) - flagClear() + testBinaryOp(div, a, b, r, f, rounding, "div") } def testSqrt() : Unit = { @@ -1132,7 +1107,8 @@ class FpuTest extends FunSuite{ def testDivF64() : Unit = { val rounding = FpuRoundMode.elements.randomPick() val (a,b,r,f) = f64.div(rounding).f64_f64_f64 - testDivF64Exact(a, b, r, f, rounding) + // testDivF64Exact(a, b, r, f, rounding) + testBinaryOpF64(div, a, b, r, f,rounding, "div") flagClear() } @@ -1280,22 +1256,34 @@ class FpuTest extends FunSuite{ var fxxTests = f32Tests if(p.withDouble) fxxTests ++= f64Tests - - + + for(_ <- 0 until 10000) testDiv() + println("f32 div done") + + for(_ <- 0 until 10000) testSqrt() + println("f32 sqrt done") + + + + //TODO test boxing //TODO double <-> simple convertions if(p.withDouble) { - load(0, 1.0) - load(0, 2.0) - load(0, 2.5) - load(0, 0.75) - load(0, -5) - load(0, 0) - load(0, Double.PositiveInfinity) - load(0, Double.NaN) - 
dut.clockDomain.waitSampling(200) - simSuccess() + testSqrtF64Exact(1.25*1.25, 1.25, 0, FpuRoundMode.RNE) + testSqrtF64Exact(1.5*1.5, 1.5, 0, FpuRoundMode.RNE) + + for(_ <- 0 until 10000) testSqrtF64() + println("f64 sqrt done") + +// testDivF64Exact(1.0, 8.0, 0.125, 0, FpuRoundMode.RNE) +// testDivF64Exact(4.0, 8.0, 0.5, 0, FpuRoundMode.RNE) +// testDivF64Exact(8.0, 8.0, 1.0, 0, FpuRoundMode.RNE) +// testDivF64Exact(1.5, 2.0, 0.75, 0, FpuRoundMode.RNE) +// testDivF64Exact(1.875, 1.5, 1.25, 0, FpuRoundMode.RNE) + + for(_ <- 0 until 10000) testDivF64() + println("f64 div done") for(_ <- 0 until 10000) testSgnjF64() println("f64 sgnj done") @@ -1338,12 +1326,6 @@ class FpuTest extends FunSuite{ println("f64 Cmp done") - for(_ <- 0 until 10000) testDivF64() - println("f64 div done") - - for(_ <- 0 until 10000) testSqrtF64() - println("f64 sqrt done") - for(_ <- 0 until 10000) testClassF64() println("f64 class done") //