From 1d0eecdcb0480534fe10217ce0554d3a74c3ca58 Mon Sep 17 00:00:00 2001
From: Dolu1990 <charles.papon.90@gmail.com>
Date: Wed, 3 Feb 2021 14:27:52 +0100
Subject: [PATCH] fpu f2i rounding ok and full shifter

---
 src/main/scala/vexriscv/ip/fpu/FpuCore.scala  |  91 +++++---
 .../scala/vexriscv/ip/fpu/Interface.scala     |   3 +-
 src/test/scala/vexriscv/ip/fpu/FpuTest.scala  | 212 ++++++++++--------
 3 files changed, 179 insertions(+), 127 deletions(-)

diff --git a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
index 2b8f9f2..4614370 100644
--- a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
+++ b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
@@ -57,13 +57,11 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
   case class ShortPipInput() extends Bundle{
     val source = Source()
     val opcode = p.Opcode()
-    val rs2 = p.internalFloating()
-    val rs1Raw = Bits(widthOf(rs2) bits)
+    val rs1, rs2 = p.internalFloating()
     val lockId = lockIdType()
     val rd = p.rfAddress()
     val value = Bits(32 bits)
     val arg = Bits(2 bits)
-    def rs1 = rs1Raw.as(p.internalFloating)
     val roundMode = FpuRoundMode()
   }
 
@@ -261,7 +259,6 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
     input.ready setWhen(shortPipHit && shortPip.ready)
     shortPip.valid := input.valid && shortPipHit
     shortPip.payload.assignSomeByName(read.output.payload)
-    shortPip.rs1Raw := read.output.rs1.asBits
 
     val divSqrtHit = input.opcode === p.Opcode.DIV ||  input.opcode === p.Opcode.SQRT
     val divSqrt = Stream(DivSqrtInput())
@@ -461,49 +458,46 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
       val f2iShift = input.rs1.exponent - U(exponentOne)
       val isF2i = input.opcode === FpuOpcode.F2I
       val needRecoding = List(FpuOpcode.FMV_X_W, FpuOpcode.STORE).map(_ === input.opcode).orR && isSubnormal
-      val manTop = Reg(UInt(log2Up(p.internalMantissaSize) bits))
-      val counter = Reg(UInt(log2Up(p.internalMantissaSize+1) bits))
       val done, boot = Reg(Bool())
       val isZero = input.rs1.isZero// || input.rs1.exponent < exponentOne-1
-      val overflow = input.rs1.exponent > (input.arg(0) ? U(exponentOne+30) | U(exponentOne+31)) && !input.rs1.sign
-      val underflow = input.rs1.exponent > (input.arg(0) ? U(exponentOne+30) | U(exponentOne-1)) && input.rs1.sign  // && !(input.arg(0) && input.rs1.exponent === exponentOne-31 && input.rs)
+
+      val shift = new Area{ // Right shifter built from one conditional stage per bit of `by`; also records whether non-zero bits were dropped
+        val by = Reg(UInt(log2Up(p.internalMantissaSize max 33) bits)) // shift amount, written by the fsm while `boot` is set
+        val input = UInt(p.internalMantissaSize max 33 bits).assignDontCare() // value to shift; driven right after this Area
+        var logic = input // running value, rebound by each stage of the loop below
+        val scrap = Reg(Bool) // set as soon as a stage drops a non-zero bit
+        for(i <- by.range){
+          scrap setWhen(by(i) && logic(0, 1 << i bits) =/= 0) // low 2^i bits are the ones this stage would shift out
+          logic \= by(i) ? (logic |>> (BigInt(1) << i)) | logic // shift right by 2^i when bit i of `by` is set
+        }
+        when(boot){
+          scrap := False // placed after the setWhen above; NOTE(review): presumably so the clear wins during boot - confirm
+        }
+        val output = RegNextWhen(logic, !done) // registered shifted value, only updated while `done` is low
+      }
+
+      shift.input := (U(!isZero) @@ input.rs1.mantissa) << 9
+
 
       when(input.valid && (needRecoding || isF2i) && !done){
         halt := True
         when(boot){
           when(isF2i){
-            when(underflow || overflow){
-              done := True
-              val low = overflow
-              val high = input.arg(0) ^ overflow
-              input.rs1Raw.getDrivingReg(0, 32 bits) := (31 -> high, default -> low)
-            } otherwise {
-              manTop := (U(exponentOne + 31) - input.rs1.exponent).resized //TODO merge
-              input.rs1Raw.getDrivingReg(0, 32 bits) := input.rs1Raw(0, 23 bits) << 9
-            }
+            shift.by := (U(exponentOne + 31) - input.rs1.exponent).min(U(33)).resized //TODO merge
           } otherwise {
-            manTop := (U(exponentOne - 127) - recoded.exponent).resized
+            shift.by := (U(exponentOne - 127+10) - recoded.exponent).resized
           }
           boot := False
-
         } otherwise {
-          when(isF2i){
-            input.rs1Raw.getDrivingReg(0, 32 bits) := (B(counter === 0 && !isZero) ## input.rs1Raw(0, 32 bits)) >> 1
-          } otherwise {
-            input.rs1Raw.getDrivingReg(0, 23 bits) := (B(counter === 0) ## input.rs1Raw(0, 23 bits)) >> 1
-          }
-          counter := counter + 1
-          when(counter === manTop) {
-            done := True
-          }
+          done := True
         }
       }
 
       when(isSubnormal){
         f32.exp := 0
+        f32.man := shift.output(22 downto 0)
       }
       when(!input.isStall){
-        counter := 0
         done := False
         boot := True
       }
@@ -526,12 +520,30 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
     }
 
 
-//    val f2iShift = input.rs1.exponent - U(exponentOne)
-//    val f2iShifted = (U"1" @@ input.rs1.mantissa) << (f2iShift.resize(5 bits))
-//    val f2iUnsigned = f2iShifted >> p.internalMantissaSize
-//    val f2iResult = (f2iUnsigned.twoComplement(input.arg(0) && input.rs1.sign)).asBits.resize(32 bits)
-    val f2iUnsigned = input.rs1Raw(0, 32 bits).asUInt
-    val f2iResult = (f2iUnsigned.twoComplement(input.arg(0) && input.rs1.sign)).asBits.resize(32 bits)
+
+    val f2i = new Area{ //Float to integer result: rounds the shifter output, applies the sign, then saturates. Will not work for 64 bits float max value rounding
+      val unsigned = fsm.shift.output >> 1 // magnitude; bit 0 of the shifter output is kept aside as a rounding bit
+      val resign = input.arg(0) && input.rs1.sign // negate the magnitude when arg(0) is set and rs1 is negative
+      val round = fsm.shift.output(0) ## fsm.shift.scrap // round(1) = first bit below the magnitude, round(0) = any lower non-zero bit
+      val increment = input.roundMode.mux( // whether the magnitude must be incremented by one
+        FpuRoundMode.RNE -> (round(1) && (round(0) || unsigned(0))), // above half, or exactly half with an odd magnitude
+        FpuRoundMode.RTZ -> False, // always truncate
+        FpuRoundMode.RDN -> (round =/= 0 &&  input.rs1.sign), // inexact and negative
+        FpuRoundMode.RUP -> (round =/= 0 && !input.rs1.sign), // inexact and positive
+        FpuRoundMode.RMM -> (round(1)) // half or above
+      )
+      val result = (Mux(resign, ~unsigned, unsigned) + (resign ^ increment).asUInt) // -(x) = ~x + 1 and -(x+1) = ~x, hence the xor
+      val overflow  = RegNext((input.rs1.exponent > (input.arg(0) ? U(exponentOne+30) | U(exponentOne+31)) || input.rs1.isInfinity) && !input.rs1.sign || input.rs1.isNan) // registered; NaN is treated as overflow
+      val underflow = RegNext((input.rs1.exponent > U(exponentOne+30) || !input.arg(0) || input.rs1.isInfinity) && input.rs1.sign) // registered; with arg(0) clear every negative input lands here
+      val isZero = input.rs1.isZero
+      when(isZero){
+        result := 0 // a zero source bypasses the rounding path
+      } elsewhen(underflow || overflow) {
+        val low = overflow // bits 30..0: all ones on overflow, all zeros on underflow
+        val high = input.arg(0) ^ overflow // bit 31: arg(0) on underflow, !arg(0) on overflow
+        result := (31 -> high, default -> low)
+      }
+    }
 
     val bothZero = input.rs1.isZero && input.rs2.isZero
     val rs1Equal = input.rs1 === input.rs2
@@ -569,7 +581,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
     switch(input.opcode){
       is(FpuOpcode.STORE)   { result := recodedResult }
       is(FpuOpcode.FMV_X_W) { result := recodedResult } //TODO
-      is(FpuOpcode.F2I)     { result := f2iResult }
+      is(FpuOpcode.F2I)     { result := f2i.result.asBits }
       is(FpuOpcode.CMP)     { result := cmpResult.resized } //TODO
       is(FpuOpcode.FCLASS)  { result := fclassResult.resized }
     }
@@ -1057,6 +1069,13 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
     port.address := input.source @@ input.rd
     port.data := input.value
 
+    if(p.sim) when(port.data.isZero || port.data.isInfinity){
+      port.data.mantissa.assignDontCare()
+    }
+    if(p.sim) when(port.data.special){
+      port.data.exponent(p.internalExponentSize-1 downto 2).assignDontCare()
+    }
+
     when(port.valid){
       assert(!(port.data.exponent === 0 && !port.data.special), "Special violation")
       assert(!(port.data.exponent === port.data.exponent.maxValue && !port.data.special), "Special violation")
diff --git a/src/main/scala/vexriscv/ip/fpu/Interface.scala b/src/main/scala/vexriscv/ip/fpu/Interface.scala
index 3c25ad9..dff4779 100644
--- a/src/main/scala/vexriscv/ip/fpu/Interface.scala
+++ b/src/main/scala/vexriscv/ip/fpu/Interface.scala
@@ -106,7 +106,8 @@ object FpuRoundModeInstr extends SpinalEnum(){
 
 
 case class FpuParameter( internalMantissaSize : Int,
-                         withDouble : Boolean){
+                         withDouble : Boolean,
+                         sim : Boolean = false){
 
   val storeLoadType = HardType(Bits(if(withDouble) 64 bits else 32 bits))
   val internalExponentSize = (if(withDouble) 11 else 8) + 1
diff --git a/src/test/scala/vexriscv/ip/fpu/FpuTest.scala b/src/test/scala/vexriscv/ip/fpu/FpuTest.scala
index 6e30e71..e3a6125 100644
--- a/src/test/scala/vexriscv/ip/fpu/FpuTest.scala
+++ b/src/test/scala/vexriscv/ip/fpu/FpuTest.scala
@@ -32,7 +32,8 @@ class FpuTest extends FunSuite{
     val portCount = 1
     val p = FpuParameter(
       internalMantissaSize = 23,
-      withDouble = false
+      withDouble = false,
+      sim = true
     )
 
     val config = SimConfig
@@ -46,16 +47,19 @@ class FpuTest extends FunSuite{
       class TestCase(op : String){
         def build(arg : String) = new ProcessStream(s"testfloat_gen $arg -forever -$op"){
           def f32_2 ={
-            val l = next
-            val s = new Scanner(l)
+            val s = new Scanner(next)
             (b2f(s.nextLong(16).toInt), b2f(s.nextLong(16).toInt), b2f(s.nextLong(16).toInt), s.nextInt(16))
           }
 
           def i32_f32 ={
-            val l = next
-            val s = new Scanner(l)
+            val s = new Scanner(next)
             (s.nextLong(16).toInt, b2f(s.nextLong(16).toInt), s.nextInt(16))
           }
+
+          def f32_i32 = {
+            val s = new Scanner(next)
+            (b2f(s.nextLong(16).toInt), s.nextLong(16).toInt, s.nextInt(16))
+          }
         }
         val RNE = build("-rnear_even")
         val RTZ = build("-rminMag")
@@ -75,10 +79,12 @@ class FpuTest extends FunSuite{
 
       val f32 = new {
         val add = new TestCase("f32_add")
+        val sub = new TestCase("f32_sub")
         val mul = new TestCase("f32_mul")
         val ui2f = new TestCase("ui32_to_f32")
         val i2f = new TestCase("i32_to_f32")
         val f2ui = new TestCase("f32_to_ui32")
+        val f2i = new TestCase("f32_to_i32")
       }
 
       val cpus = for(id <- 0 until portCount) yield new {
@@ -147,14 +153,14 @@ class FpuTest extends FunSuite{
           storeRaw(rs){rsp => body(b2f(rsp.value.toLong.toInt))}
         }
 
-        def mul(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
+        def fpuF2f(rd : Int, rs1 : Int, rs2 : Int, rs3 : Int, opcode : FpuOpcode.E, arg : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={ // queues one command plus its commit
           cmdQueue += {cmd =>
-            cmd.opcode #= cmd.opcode.spinalEnum.MUL
+            cmd.opcode #= opcode
             cmd.rs1 #= rs1
             cmd.rs2 #= rs2
-            cmd.rs3.randomize()
+            cmd.rs3 #= rs3 // forward the caller's rs3 (fma depends on it) instead of randomizing it
             cmd.rd #= rd
-            cmd.arg #= 0
+            cmd.arg #= arg
             cmd.roundMode #= rounding
           }
           commitQueue += {cmd =>
@@ -163,90 +169,51 @@ class FpuTest extends FunSuite{
           }
         }
 
+        def fpuF2i(rs1 : Int, rs2 : Int, opcode : FpuOpcode.E, arg : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE)(body : FpuRsp => Unit): Unit ={ // queues one command and registers body on the response queue
+          cmdQueue += {cmd =>
+            cmd.opcode #= opcode
+            cmd.rs1 #= rs1
+            cmd.rs2 #= rs2
+            cmd.rs3.randomize() // no caller-supplied rs3 for this helper
+            cmd.rd.randomize() // no caller-supplied rd either; unlike fpuF2f, no commit is queued
+            cmd.arg #= arg
+            cmd.roundMode #= rounding
+          }
+          rspQueue += body // body is queued to handle an FpuRsp
+        }
+
+
+        def mul(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
+          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.MUL, 0, rounding)
+        }
+
         def add(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
-          cmdQueue += {cmd =>
-            cmd.opcode #= cmd.opcode.spinalEnum.ADD
-            cmd.rs1 #= rs1
-            cmd.rs2 #= rs2
-            cmd.rs3.randomize()
-            cmd.rd #= rd
-            cmd.arg #= 0
-            cmd.roundMode #= rounding
-          }
-          commitQueue += {cmd =>
-            cmd.write #= true
-            cmd.sync #= false
-          }
+          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.ADD, 0, rounding)
         }
 
-        def div(rd : Int, rs1 : Int, rs2 : Int): Unit ={
-          cmdQueue += {cmd =>
-            cmd.opcode #= cmd.opcode.spinalEnum.DIV
-            cmd.rs1 #= rs1
-            cmd.rs2 #= rs2
-            cmd.rs3.randomize()
-            cmd.rd #= rd
-            cmd.arg.randomize()
-          }
-          commitQueue += {cmd =>
-            cmd.write #= true
-            cmd.sync #= false
-          }
+        def sub(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
+          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.ADD, 1, rounding)
         }
 
-        def sqrt(rd : Int, rs1 : Int): Unit ={
-          cmdQueue += {cmd =>
-            cmd.opcode #= cmd.opcode.spinalEnum.SQRT
-            cmd.rs1 #= rs1
-            cmd.rs2.randomize()
-            cmd.rs3.randomize()
-            cmd.rd #= rd
-            cmd.arg.randomize()
-          }
-          commitQueue += {cmd =>
-            cmd.write #= true
-            cmd.sync #= false
-          }
+        def div(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
+          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.DIV, Random.nextInt(4), rounding)
         }
 
-        def fma(rd : Int, rs1 : Int, rs2 : Int, rs3 : Int): Unit ={
-          cmdQueue += {cmd =>
-            cmd.opcode #= cmd.opcode.spinalEnum.FMA
-            cmd.rs1 #= rs1
-            cmd.rs2 #= rs2
-            cmd.rs3 #= rs3
-            cmd.rd #= rd
-            cmd.arg #= 0
-          }
-          commitQueue += {cmd =>
-            cmd.write #= true
-            cmd.sync #= false
-          }
+        def sqrt(rd : Int, rs1 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
+          fpuF2f(rd, rs1, Random.nextInt(32), Random.nextInt(32), FpuOpcode.SQRT, Random.nextInt(4), rounding)
+        }
+
+        def fma(rd : Int, rs1 : Int, rs2 : Int, rs3 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
+          fpuF2f(rd, rs1, rs2, rs3, FpuOpcode.FMA, 0, rounding)
         }
 
 
         def cmp(rs1 : Int, rs2 : Int)(body : FpuRsp => Unit): Unit ={
-          cmdQueue += {cmd =>
-            cmd.opcode #= cmd.opcode.spinalEnum.CMP
-            cmd.rs1 #= rs1
-            cmd.rs2 #= rs2
-            cmd.rs3.randomize()
-            cmd.rd.randomize()
-            cmd.arg #= 1
-          }
-          rspQueue += body
+          fpuF2i(rs1, rs2, FpuOpcode.CMP, 1, FpuRoundMode.elements.randomPick())(body)
         }
 
-        def f2i(rs1 : Int, signed : Boolean)(body : FpuRsp => Unit): Unit ={
-          cmdQueue += {cmd =>
-            cmd.opcode #= cmd.opcode.spinalEnum.F2I
-            cmd.rs1 #= rs1
-            cmd.rs2.randomize()
-            cmd.rs3.randomize()
-            cmd.rd.randomize()
-            cmd.arg #= (if(signed) 1 else 0)
-          }
-          rspQueue += body
+        def f2i(rs1 : Int, signed : Boolean, rounding : FpuRoundMode.E = FpuRoundMode.RNE)(body : FpuRsp => Unit): Unit ={
+          fpuF2i(rs1, Random.nextInt(32), FpuOpcode.F2I, if(signed) 1 else 0, rounding)(body)
         }
 
         def i2f(rd : Int, value : Int, signed : Boolean, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
@@ -388,6 +355,18 @@ class FpuTest extends FunSuite{
           }
         }
 
+        def testBinaryOp(op : (Int,Int,Int,FpuRoundMode.E) => Unit, a : Float, b : Float, ref : Float, flag : Int, rounding : FpuRoundMode.E, opName : String): Unit ={ // loads a and b, applies op, then checks the stored result against ref
+          val rs = new RegAllocator()
+          val rs1, rs2, rs3 = rs.allocate() // one allocate() call per name; rs3 is not used below
+          val rd = Random.nextInt(32) // destination register picked at random
+          load(rs1, a)
+          load(rs2, b)
+          op(rd,rs1,rs2, rounding)
+          storeFloat(rd){v =>
+            assert(f2b(v) == f2b(ref), f"## ${a}  ${opName}  $b = $v, $ref $rounding") // compared through f2b; flag is not checked
+          }
+        }
+
         def testAddExact(a : Float, b : Float, ref : Float, flag : Int, rounding : FpuRoundMode.E): Unit ={
           val rs = new RegAllocator()
           val rs1, rs2, rs3 = rs.allocate()
@@ -517,6 +496,31 @@ class FpuTest extends FunSuite{
           }
         }
 
+        def testF2iExact(a : Float, ref : Int, flag : Int, signed : Boolean, rounding : FpuRoundMode.E): Unit ={
+          val rs = new RegAllocator()
+          val rs1 = rs.allocate()
+          val rd = Random.nextInt(32)
+          load(rs1, a)
+          f2i(rs1, signed, rounding){rsp =>
+            if(signed) {
+              val v = rsp.value.toLong.toInt
+              // Expected value: NaN and inputs >= Int.MaxValue saturate high, inputs <= Int.MinValue saturate low
+              val ref2 = if(a.isNaN || a >= Int.MaxValue) Int.MaxValue
+                         else if(a <= Int.MinValue) Int.MinValue
+                         else ref
+              assert(v == (ref2), f" <= f2i($a) = $v, $ref2, $rounding, $flag")
+            } else {
+              val v = rsp.value.toLong
+              // Expected value: NaN and inputs >= 0xFFFFFFFF saturate to all ones, negative inputs give 0
+              val ref2 = if(a.isNaN || a >= 0xFFFFFFFFl) 0xFFFFFFFFl
+                         else if(a < 0) 0L
+                         else ref.toLong & 0xFFFFFFFFl
+              assert(v == ref2, f" <= f2ui($a) = $v, $ref2, $rounding $flag")
+            }
+          }
+        }
+
+
         def testI2f(a : Int, signed : Boolean): Unit ={
           val rs = new RegAllocator()
           val rd = Random.nextInt(32)
@@ -538,7 +542,7 @@ class FpuTest extends FunSuite{
             val aLong = if(signed) a.toLong else a.toLong & 0xFFFFFFFFl
             val ref = b
 //            println(f"i2f($aLong) = $v, $ref")
-            assert(f2b(v) == f2b(ref))
+            assert(f2b(v) == f2b(ref), f"i2f($aLong) = $v, $ref")
           }
         }
 
@@ -647,6 +651,7 @@ class FpuTest extends FunSuite{
         }
 
 
+
 //        for(i <- 0 until 64){
 //          val rounding = FpuRoundMode.RMM
 //          val a = 24f
@@ -656,36 +661,63 @@ class FpuTest extends FunSuite{
 //          testMulExact(a,b,c,f, rounding)
 //        }
 
-        for(_ <- 0 until 100000){
+        val binaryOps = List[(Int,Int,Int,FpuRoundMode.E) => Unit](add, sub, mul)
+
+
+
+        for(_ <- 0 until 10000){
           val rounding = FpuRoundMode.elements.randomPick()
           val (a,b,f) = f32.i2f(rounding).i32_f32
           testI2fExact(a,b,f, true, rounding)
         }
-        for(_ <- 0 until 100000){
+
+        for(_ <- 0 until 10000){
           val rounding = FpuRoundMode.elements.randomPick()
           val (a,b,f) = f32.ui2f(rounding).i32_f32
           testI2fExact(a,b,f, false, rounding)
         }
         println("i2f done")
 
+        for(_ <- 0 until 10000){
+          val rounding = FpuRoundMode.elements.randomPick()
+          val (a,b,f) = f32.f2ui(rounding).f32_i32
+          testF2iExact(a,b, f, false, rounding)
+        }
 
-        for(_ <- 0 until 100000){
+        for(_ <- 0 until 10000){
+          val rounding = FpuRoundMode.elements.randomPick()
+          val (a,b,f) = f32.f2i(rounding).f32_i32
+          testF2iExact(a,b, f, true, rounding)
+        }
+
+        println("f2i done")
+
+
+        for(_ <- 0 until 10000){
+          val rounding = FpuRoundMode.elements.randomPick()
+          val (a,b,c,f) = f32.add(rounding).f32_2
+          testBinaryOp(add,a,b,c,f, rounding,"add")
+        }
+
+        for(_ <- 0 until 10000){
+          val rounding = FpuRoundMode.elements.randomPick()
+          val (a,b,c,f) = f32.sub(rounding).f32_2
+          testBinaryOp(sub,a,b,c,f, rounding,"sub")
+        }
+
+        println("Add done")
+
+        for(_ <- 0 until 10000){
           val rounding = FpuRoundMode.elements.randomPick()
           val (a,b,c,f) = f32.mul(rounding).f32_2
-          testMulExact(a,b,c,f, rounding)
+          testBinaryOp(mul,a,b,c,f, rounding,"mul")
         }
 
         println("Mul done")
 
 
 
-        for(_ <- 0 until 100000){
-          val rounding = FpuRoundMode.elements.randomPick()
-          val (a,b,c,f) = f32.add(rounding).f32_2
-          testAddExact(a,b,c,f, rounding)
-        }
 
-        println("Add done")
 
         waitUntil(cmdQueue.isEmpty)
         dut.clockDomain.waitSampling(1000)