From 1d0eecdcb0480534fe10217ce0554d3a74c3ca58 Mon Sep 17 00:00:00 2001 From: Dolu1990 Date: Wed, 3 Feb 2021 14:27:52 +0100 Subject: [PATCH] fpu f2i rounding ok and full shifter --- src/main/scala/vexriscv/ip/fpu/FpuCore.scala | 91 +++++--- .../scala/vexriscv/ip/fpu/Interface.scala | 3 +- src/test/scala/vexriscv/ip/fpu/FpuTest.scala | 212 ++++++++++-------- 3 files changed, 179 insertions(+), 127 deletions(-) diff --git a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala index 2b8f9f2..4614370 100644 --- a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala +++ b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala @@ -57,13 +57,11 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ case class ShortPipInput() extends Bundle{ val source = Source() val opcode = p.Opcode() - val rs2 = p.internalFloating() - val rs1Raw = Bits(widthOf(rs2) bits) + val rs1, rs2 = p.internalFloating() val lockId = lockIdType() val rd = p.rfAddress() val value = Bits(32 bits) val arg = Bits(2 bits) - def rs1 = rs1Raw.as(p.internalFloating) val roundMode = FpuRoundMode() } @@ -261,7 +259,6 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ input.ready setWhen(shortPipHit && shortPip.ready) shortPip.valid := input.valid && shortPipHit shortPip.payload.assignSomeByName(read.output.payload) - shortPip.rs1Raw := read.output.rs1.asBits val divSqrtHit = input.opcode === p.Opcode.DIV || input.opcode === p.Opcode.SQRT val divSqrt = Stream(DivSqrtInput()) @@ -461,49 +458,46 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ val f2iShift = input.rs1.exponent - U(exponentOne) val isF2i = input.opcode === FpuOpcode.F2I val needRecoding = List(FpuOpcode.FMV_X_W, FpuOpcode.STORE).map(_ === input.opcode).orR && isSubnormal - val manTop = Reg(UInt(log2Up(p.internalMantissaSize) bits)) - val counter = Reg(UInt(log2Up(p.internalMantissaSize+1) bits)) val done, boot = Reg(Bool()) val isZero = input.rs1.isZero// || input.rs1.exponent < exponentOne-1 - val overflow = input.rs1.exponent > (input.arg(0) ? U(exponentOne+30) | U(exponentOne+31)) && !input.rs1.sign - val underflow = input.rs1.exponent > (input.arg(0) ? U(exponentOne+30) | U(exponentOne-1)) && input.rs1.sign // && !(input.arg(0) && input.rs1.exponent === exponentOne-31 && input.rs) + + val shift = new Area{ + val by = Reg(UInt(log2Up(p.internalMantissaSize max 33) bits)) + val input = UInt(p.internalMantissaSize max 33 bits).assignDontCare() + var logic = input + val scrap = Reg(Bool) + for(i <- by.range){ + scrap setWhen(by(i) && logic(0, 1 << i bits) =/= 0) + logic \= by(i) ? (logic |>> (BigInt(1) << i)) | logic + } + when(boot){ + scrap := False + } + val output = RegNextWhen(logic, !done) + } + + shift.input := (U(!isZero) @@ input.rs1.mantissa) << 9 + when(input.valid && (needRecoding || isF2i) && !done){ halt := True when(boot){ when(isF2i){ - when(underflow || overflow){ - done := True - val low = overflow - val high = input.arg(0) ^ overflow - input.rs1Raw.getDrivingReg(0, 32 bits) := (31 -> high, default -> low) - } otherwise { - manTop := (U(exponentOne + 31) - input.rs1.exponent).resized //TODO merge - input.rs1Raw.getDrivingReg(0, 32 bits) := input.rs1Raw(0, 23 bits) << 9 - } + shift.by := (U(exponentOne + 31) - input.rs1.exponent).min(U(33)).resized //TODO merge } otherwise { - manTop := (U(exponentOne - 127) - recoded.exponent).resized + shift.by := (U(exponentOne - 127+10) - recoded.exponent).resized } boot := False - } otherwise { - when(isF2i){ - input.rs1Raw.getDrivingReg(0, 32 bits) := (B(counter === 0 && !isZero) ## input.rs1Raw(0, 32 bits)) >> 1 - } otherwise { - input.rs1Raw.getDrivingReg(0, 23 bits) := (B(counter === 0) ## input.rs1Raw(0, 23 bits)) >> 1 - } - counter := counter + 1 - when(counter === manTop) { - done := True - } + done := True } } when(isSubnormal){ f32.exp := 0 + f32.man := shift.output(22 downto 0) } when(!input.isStall){ - counter := 0 done := False boot := True } @@ -526,12 +520,30 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ } -// val f2iShift = input.rs1.exponent - U(exponentOne) -// val f2iShifted = (U"1" @@ input.rs1.mantissa) << (f2iShift.resize(5 bits)) -// val f2iUnsigned = f2iShifted >> p.internalMantissaSize -// val f2iResult = (f2iUnsigned.twoComplement(input.arg(0) && input.rs1.sign)).asBits.resize(32 bits) - val f2iUnsigned = input.rs1Raw(0, 32 bits).asUInt - val f2iResult = (f2iUnsigned.twoComplement(input.arg(0) && input.rs1.sign)).asBits.resize(32 bits) + + val f2i = new Area{ //Will not work for 64 bits float max value rounding + val unsigned = fsm.shift.output >> 1 + val resign = input.arg(0) && input.rs1.sign + val round = fsm.shift.output(0) ## fsm.shift.scrap + val increment = input.roundMode.mux( + FpuRoundMode.RNE -> (round(1) && (round(0) || unsigned(0))), + FpuRoundMode.RTZ -> False, + FpuRoundMode.RDN -> (round =/= 0 && input.rs1.sign), + FpuRoundMode.RUP -> (round =/= 0 && !input.rs1.sign), + FpuRoundMode.RMM -> (round(1)) + ) + val result = (Mux(resign, ~unsigned, unsigned) + (resign ^ increment).asUInt) + val overflow = RegNext((input.rs1.exponent > (input.arg(0) ? U(exponentOne+30) | U(exponentOne+31)) || input.rs1.isInfinity) && !input.rs1.sign || input.rs1.isNan) + val underflow = RegNext((input.rs1.exponent > U(exponentOne+30) || !input.arg(0) || input.rs1.isInfinity) && input.rs1.sign) + val isZero = input.rs1.isZero + when(isZero){ + result := 0 + } elsewhen(underflow || overflow) { + val low = overflow + val high = input.arg(0) ^ overflow + result := (31 -> high, default -> low) + } + } val bothZero = input.rs1.isZero && input.rs2.isZero val rs1Equal = input.rs1 === input.rs2 @@ -569,7 +581,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ switch(input.opcode){ is(FpuOpcode.STORE) { result := recodedResult } is(FpuOpcode.FMV_X_W) { result := recodedResult } //TODO - is(FpuOpcode.F2I) { result := f2iResult } + is(FpuOpcode.F2I) { result := f2i.result.asBits } is(FpuOpcode.CMP) { result := cmpResult.resized } //TODO is(FpuOpcode.FCLASS) { result := fclassResult.resized } } @@ -1057,6 +1069,13 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{ port.address := input.source @@ input.rd port.data := input.value + if(p.sim) when(port.data.isZero || port.data.isInfinity){ + port.data.mantissa.assignDontCare() + } + if(p.sim) when(port.data.special){ + port.data.exponent(p.internalExponentSize-1 downto 2).assignDontCare() + } + when(port.valid){ assert(!(port.data.exponent === 0 && !port.data.special), "Special violation") assert(!(port.data.exponent === port.data.exponent.maxValue && !port.data.special), "Special violation") diff --git a/src/main/scala/vexriscv/ip/fpu/Interface.scala b/src/main/scala/vexriscv/ip/fpu/Interface.scala index 3c25ad9..dff4779 100644 --- a/src/main/scala/vexriscv/ip/fpu/Interface.scala +++ b/src/main/scala/vexriscv/ip/fpu/Interface.scala @@ -106,7 +106,8 @@ object FpuRoundModeInstr extends SpinalEnum(){ case class FpuParameter( internalMantissaSize : Int, - withDouble : Boolean){ + withDouble : Boolean, + sim : Boolean = false){ val storeLoadType = HardType(Bits(if(withDouble) 64 bits else 32 bits)) val internalExponentSize = (if(withDouble) 11 else 8) + 1 diff --git a/src/test/scala/vexriscv/ip/fpu/FpuTest.scala b/src/test/scala/vexriscv/ip/fpu/FpuTest.scala index 6e30e71..e3a6125 100644 --- a/src/test/scala/vexriscv/ip/fpu/FpuTest.scala +++ b/src/test/scala/vexriscv/ip/fpu/FpuTest.scala @@ -32,7 +32,8 @@ class FpuTest extends FunSuite{ val portCount = 1 val p = FpuParameter( internalMantissaSize = 23, - withDouble = false + withDouble = false, + sim = true ) val config = SimConfig @@ -46,16 +47,19 @@ class FpuTest extends FunSuite{ class TestCase(op : String){ def build(arg : String) = new ProcessStream(s"testfloat_gen $arg -forever -$op"){ def f32_2 ={ - val l = next - val s = new Scanner(l) + val s = new Scanner(next) (b2f(s.nextLong(16).toInt), b2f(s.nextLong(16).toInt), b2f(s.nextLong(16).toInt), s.nextInt(16)) } def i32_f32 ={ - val l = next - val s = new Scanner(l) + val s = new Scanner(next) (s.nextLong(16).toInt, b2f(s.nextLong(16).toInt), s.nextInt(16)) } + + def f32_i32 = { + val s = new Scanner(next) + (b2f(s.nextLong(16).toInt), s.nextLong(16).toInt, s.nextInt(16)) + } } val RNE = build("-rnear_even") val RTZ = build("-rminMag") @@ -75,10 +79,12 @@ class FpuTest extends FunSuite{ val f32 = new { val add = new TestCase("f32_add") + val sub = new TestCase("f32_sub") val mul = new TestCase("f32_mul") val ui2f = new TestCase("ui32_to_f32") val i2f = new TestCase("i32_to_f32") val f2ui = new TestCase("f32_to_ui32") + val f2i = new TestCase("f32_to_i32") } val cpus = for(id <- 0 until portCount) yield new { @@ -147,14 +153,14 @@ class FpuTest extends FunSuite{ storeRaw(rs){rsp => body(b2f(rsp.value.toLong.toInt))} } - def mul(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={ + def fpuF2f(rd : Int, rs1 : Int, rs2 : Int, rs3 : Int, opcode : FpuOpcode.E, arg : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={ cmdQueue += {cmd => - cmd.opcode #= cmd.opcode.spinalEnum.MUL + cmd.opcode #= opcode cmd.rs1 #= rs1 cmd.rs2 #= rs2 cmd.rs3.randomize() cmd.rd #= rd - cmd.arg #= 0 + cmd.arg #= arg cmd.roundMode #= rounding } commitQueue += {cmd => @@ -163,90 +169,51 @@ class FpuTest extends FunSuite{ } } + def fpuF2i(rs1 : Int, rs2 : Int, opcode : FpuOpcode.E, arg : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE)(body : FpuRsp => Unit): Unit ={ + cmdQueue += {cmd => + cmd.opcode #= opcode + cmd.rs1 #= rs1 + cmd.rs2 #= rs2 + cmd.rs3.randomize() + cmd.rd.randomize() + cmd.arg #= arg + cmd.roundMode #= rounding + } + rspQueue += body + } + + + def mul(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={ + fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.MUL, 0, rounding) + } + def add(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={ - cmdQueue += {cmd => - cmd.opcode #= cmd.opcode.spinalEnum.ADD - cmd.rs1 #= rs1 - cmd.rs2 #= rs2 - cmd.rs3.randomize() - cmd.rd #= rd - cmd.arg #= 0 - cmd.roundMode #= rounding - } - commitQueue += {cmd => - cmd.write #= true - cmd.sync #= false - } + fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.ADD, 0, rounding) } - def div(rd : Int, rs1 : Int, rs2 : Int): Unit ={ - cmdQueue += {cmd => - cmd.opcode #= cmd.opcode.spinalEnum.DIV - cmd.rs1 #= rs1 - cmd.rs2 #= rs2 - cmd.rs3.randomize() - cmd.rd #= rd - cmd.arg.randomize() - } - commitQueue += {cmd => - cmd.write #= true - cmd.sync #= false - } + def sub(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={ + fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.ADD, 1, rounding) } - def sqrt(rd : Int, rs1 : Int): Unit ={ - cmdQueue += {cmd => - cmd.opcode #= cmd.opcode.spinalEnum.SQRT - cmd.rs1 #= rs1 - cmd.rs2.randomize() - cmd.rs3.randomize() - cmd.rd #= rd - cmd.arg.randomize() - } - commitQueue += {cmd => - cmd.write #= true - cmd.sync #= false - } + def div(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={ + fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.DIV, Random.nextInt(4), rounding) } - def fma(rd : Int, rs1 : Int, rs2 : Int, rs3 : Int): Unit ={ - cmdQueue += {cmd => - cmd.opcode #= cmd.opcode.spinalEnum.FMA - cmd.rs1 #= rs1 - cmd.rs2 #= rs2 - cmd.rs3 #= rs3 - cmd.rd #= rd - cmd.arg #= 0 - } - commitQueue += {cmd => - cmd.write #= true - cmd.sync #= false - } + def sqrt(rd : Int, rs1 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={ + fpuF2f(rd, rs1, Random.nextInt(32), Random.nextInt(32), FpuOpcode.SQRT, Random.nextInt(4), rounding) + } + + def fma(rd : Int, rs1 : Int, rs2 : Int, rs3 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={ + fpuF2f(rd, rs1, rs2, rs3, FpuOpcode.FMA, 0, rounding) } def cmp(rs1 : Int, rs2 : Int)(body : FpuRsp => Unit): Unit ={ - cmdQueue += {cmd => - cmd.opcode #= cmd.opcode.spinalEnum.CMP - cmd.rs1 #= rs1 - cmd.rs2 #= rs2 - cmd.rs3.randomize() - cmd.rd.randomize() - cmd.arg #= 1 - } - rspQueue += body + fpuF2i(rs1, rs2, FpuOpcode.CMP, 1, FpuRoundMode.elements.randomPick())(body) } - def f2i(rs1 : Int, signed : Boolean)(body : FpuRsp => Unit): Unit ={ - cmdQueue += {cmd => - cmd.opcode #= cmd.opcode.spinalEnum.F2I - cmd.rs1 #= rs1 - cmd.rs2.randomize() - cmd.rs3.randomize() - cmd.rd.randomize() - cmd.arg #= (if(signed) 1 else 0) - } - rspQueue += body + def f2i(rs1 : Int, signed : Boolean, rounding : FpuRoundMode.E = FpuRoundMode.RNE)(body : FpuRsp => Unit): Unit ={ + fpuF2i(rs1, Random.nextInt(32), FpuOpcode.F2I, if(signed) 1 else 0, rounding)(body) } def i2f(rd : Int, value : Int, signed : Boolean, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={ @@ -388,6 +355,18 @@ class FpuTest extends FunSuite{ } } + def testBinaryOp(op : (Int,Int,Int,FpuRoundMode.E) => Unit, a : Float, b : Float, ref : Float, flag : Int, rounding : FpuRoundMode.E, opName : String): Unit ={ + val rs = new RegAllocator() + val rs1, rs2, rs3 = rs.allocate() + val rd = Random.nextInt(32) + load(rs1, a) + load(rs2, b) + op(rd,rs1,rs2, rounding) + storeFloat(rd){v => + assert(f2b(v) == f2b(ref), f"## ${a} ${opName} $b = $v, $ref $rounding") + } + } + def testAddExact(a : Float, b : Float, ref : Float, flag : Int, rounding : FpuRoundMode.E): Unit ={ val rs = new RegAllocator() val rs1, rs2, rs3 = rs.allocate() @@ -517,6 +496,31 @@ class FpuTest extends FunSuite{ } } + def testF2iExact(a : Float, ref : Int, flag : Int, signed : Boolean, rounding : FpuRoundMode.E): Unit ={ + val rs = new RegAllocator() + val rs1 = rs.allocate() + val rd = Random.nextInt(32) + load(rs1, a) + f2i(rs1, signed, rounding){rsp => + if(signed) { + val v = rsp.value.toLong.toInt + var ref2 = ref + if(a >= Int.MaxValue) ref2 = Int.MaxValue + if(a <= Int.MinValue) ref2 = Int.MinValue + if(a.isNaN) ref2 = Int.MaxValue + assert(v == (ref2), f" <= f2i($a) = $v, $ref2, $rounding, $flag") + } else { + val v = rsp.value.toLong + var ref2 = ref.toLong & 0xFFFFFFFFl + if(a < 0) ref2 = 0 + if(a >= 0xFFFFFFFFl) ref2 = 0xFFFFFFFFl + if(a.isNaN) ref2 = 0xFFFFFFFFl + assert(v == ref2, f" <= f2ui($a) = $v, $ref2, $rounding $flag") + } + } + } + + def testI2f(a : Int, signed : Boolean): Unit ={ val rs = new RegAllocator() val rd = Random.nextInt(32) @@ -538,7 +542,7 @@ class FpuTest extends FunSuite{ val aLong = if(signed) a.toLong else a.toLong & 0xFFFFFFFFl val ref = b // println(f"i2f($aLong) = $v, $ref") - assert(f2b(v) == f2b(ref)) + assert(f2b(v) == f2b(ref), f"i2f($aLong) = $v, $ref") } } @@ -647,6 +651,7 @@ class FpuTest extends FunSuite{ } + // for(i <- 0 until 64){ // val rounding = FpuRoundMode.RMM // val a = 24f @@ -656,36 +661,63 @@ class FpuTest extends FunSuite{ // testMulExact(a,b,c,f, rounding) // } - for(_ <- 0 until 100000){ + val binaryOps = List[(Int,Int,Int,FpuRoundMode.E) => Unit](add, sub, mul) + + + + for(_ <- 0 until 10000){ val rounding = FpuRoundMode.elements.randomPick() val (a,b,f) = f32.i2f(rounding).i32_f32 testI2fExact(a,b,f, true, rounding) } - for(_ <- 0 until 100000){ + + for(_ <- 0 until 10000){ val rounding = FpuRoundMode.elements.randomPick() val (a,b,f) = f32.ui2f(rounding).i32_f32 testI2fExact(a,b,f, false, rounding) } println("i2f done") + for(_ <- 0 until 10000){ + val rounding = FpuRoundMode.elements.randomPick() + val (a,b,f) = f32.f2ui(rounding).f32_i32 + testF2iExact(a,b, f, false, rounding) + } - for(_ <- 0 until 100000){ + for(_ <- 0 until 10000){ + val rounding = FpuRoundMode.elements.randomPick() + val (a,b,f) = f32.f2i(rounding).f32_i32 + testF2iExact(a,b, f, true, rounding) + } + + println("f2i done") + + + for(_ <- 0 until 10000){ + val rounding = FpuRoundMode.elements.randomPick() + val (a,b,c,f) = f32.add(rounding).f32_2 + testBinaryOp(add,a,b,c,f, rounding,"add") + } + + for(_ <- 0 until 10000){ + val rounding = FpuRoundMode.elements.randomPick() + val (a,b,c,f) = f32.sub(rounding).f32_2 + testBinaryOp(sub,a,b,c,f, rounding,"sub") + } + + println("Add done") + + for(_ <- 0 until 10000){ val rounding = FpuRoundMode.elements.randomPick() val (a,b,c,f) = f32.mul(rounding).f32_2 - testMulExact(a,b,c,f, rounding) + testBinaryOp(mul,a,b,c,f, rounding,"mul") } println("Mul done") - for(_ <- 0 until 100000){ - val rounding = FpuRoundMode.elements.randomPick() - val (a,b,c,f) = f32.add(rounding).f32_2 - testAddExact(a,b,c,f, rounding) - } - println("Add done") waitUntil(cmdQueue.isEmpty) dut.clockDomain.waitSampling(1000)