From b6eda1ad7a1e7611e9221192f6ea79274c22a22b Mon Sep 17 00:00:00 2001
From: Dolu1990 <charles.papon.90@gmail.com>
Date: Thu, 11 Feb 2021 16:07:47 +0100
Subject: [PATCH] fpu f64 load/store/mv/mul seems ok

---
 src/main/scala/vexriscv/ip/fpu/FpuCore.scala | 112 ++++--
 src/test/scala/vexriscv/ip/fpu/FpuTest.scala | 400 +++++++++++++------
 2 files changed, 346 insertions(+), 166 deletions(-)

diff --git a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
index dd4352e..edb6dba 100644
--- a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
+++ b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
@@ -21,6 +21,8 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
   val exponentOne = (1 << p.internalExponentSize-1) - 1
   val exponentF32Subnormal = exponentOne-127
   val exponentF64Subnormal = exponentOne-1023
+  val exponentF32Infinity = exponentOne+127+1
+  val exponentF64Infinity = exponentOne+1023+1
 
   val rfLockCount = 5
   val lockIdType = HardType(UInt(log2Up(rfLockCount) bits))
@@ -30,6 +32,11 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
     if(!p.withDouble) no
   }
 
+  def muxDouble[T <: Data](format : FpuFormat.C)(yes : => T)(no : => T): T ={ // Picks `yes` when format is DOUBLE, `no` otherwise
+    if(p.withDouble) ((format === FpuFormat.DOUBLE) ? { yes } | { no }) // Selection is driven by the `format` value
+    else no // Without double support the by-name `yes` is never evaluated
+  }
+
   case class RfReadInput() extends Bundle{
     val source = Source()
     val opcode = p.Opcode()
@@ -254,11 +261,16 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
     output.rs3 := rs3Entry.value
     if(p.withDouble){
       output.format := s1.format
-      when(s1.format === FpuFormat.FLOAT =/= rs1Entry.boxed){
+      val store = s1.opcode === FpuOpcode.STORE ||s1.opcode === FpuOpcode.FMV_X_W
+      when(store){ //Pass through
+        output.format := rs1Entry.boxed ? FpuFormat.FLOAT | FpuFormat.DOUBLE
+      } elsewhen(s1.format === FpuFormat.FLOAT =/= rs1Entry.boxed){
         output.rs1.setNanQuiet
+        output.rs1.sign := False
       }
       when(s1.format === FpuFormat.FLOAT =/= rs2Entry.boxed){
         output.rs2.setNanQuiet
+        output.rs2.sign := False
       }
       when(s1.format === FpuFormat.FLOAT =/= rs3Entry.boxed){
         output.rs3.setNanQuiet
@@ -364,7 +376,13 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
       output.i2f := input.i2f
       output.arg := input.arg
       output.roundMode := input.roundMode
-      if(p.withDouble) output.format := input.format
+      if(p.withDouble) {
+        output.format := input.format
+        when(!input.i2f && input.format === FpuFormat.DOUBLE && output.value(63 downto 32).andR){ //Detect boxing
+          output.format := FpuFormat.FLOAT
+        }
+      }
+
     }
 
     val s1 = new Area{
@@ -378,25 +396,34 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
       }
       val f64 = p.withDouble generate new Area{
         val mantissa = input.value(0, 52 bits).asUInt
-        val exponent = input.value(11, 52 bits).asUInt
+        val exponent = input.value(52, 11 bits).asUInt
         val sign     = input.value(63)
       }
 
+      val recodedExpOffset = UInt(p.internalExponentSize bits)
       val passThroughFloat = p.internalFloating()
       passThroughFloat.special := False
-      passThroughFloat.sign := f32.sign
-      passThroughFloat.exponent := f32.exponent.resized
-      passThroughFloat.mantissa := f32.mantissa << (if(p.withDouble) 29 else 0)
-      if(p.withDouble) when(input.format === FpuFormat.DOUBLE){
+
+      whenDouble(input.format){
         passThroughFloat.sign := f64.sign
         passThroughFloat.exponent := f64.exponent.resized
         passThroughFloat.mantissa := f64.mantissa
+        recodedExpOffset := exponentF64Subnormal
+      } {
+        passThroughFloat.sign := f32.sign
+        passThroughFloat.exponent := f32.exponent.resized
+        passThroughFloat.mantissa := f32.mantissa << (if (p.withDouble) 29 else 0)
+        recodedExpOffset := exponentF32Subnormal
       }
 
+
       val manZero = passThroughFloat.mantissa === 0
       val expZero = passThroughFloat.exponent === 0
       val expOne =  passThroughFloat.exponent(7 downto 0).andR
-      if(p.withDouble) expOne.clearWhen(input.format === FpuFormat.DOUBLE && !passThroughFloat.exponent(11 downto 8).andR)
+      if(p.withDouble) {
+        expZero.clearWhen(input.format === FpuFormat.DOUBLE && input.value(62 downto 60) =/= 0)
+        expOne.clearWhen(input.format === FpuFormat.DOUBLE && input.value(62 downto 60) =/= 7)
+      }
 
       val isZero      =  expZero &&  manZero
       val isSubnormal =  expZero && !manZero
@@ -409,9 +436,10 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
         val ohInputWidth = 32 max p.internalMantissaSize
         val ohInput = Bits(ohInputWidth bits).assignDontCare()
         when(!input.i2f) {
-          if(!p.withDouble) ohInput(ohInputWidth-23, 23 bits) := input.value(0, 23 bits)
+          if(!p.withDouble) ohInput := input.value(0, 23 bits) << 9
           if( p.withDouble) ohInput := passThroughFloat.mantissa.asBits
         } otherwise {
+          ohInput(ohInputWidth-32-1 downto 0) := 0
           ohInput(ohInputWidth-32, 32 bits) := input.value(31 downto 0)
         }
 
@@ -426,15 +454,15 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
           }
           val output = RegNextWhen(logic, !done)
         }
-        shift.input := (input.value.asUInt |<< 1).resized
+        shift.input := (ohInput.asUInt |<< 1).resized
 
-        val subnormalShiftOffset = if(!p.withDouble) U(9) else ((input.format === FpuFormat.DOUBLE) ? U(0) | U(0))
-        val subnormalExpOffset = if(!p.withDouble) U(9) else ((input.format === FpuFormat.DOUBLE)   ? U(0) | U(0))
+        val subnormalShiftOffset = if(!p.withDouble) U(0) else ((input.format === FpuFormat.DOUBLE) ? U(0) | U(0)) //TODO remove ?
+        val subnormalExpOffset = if(!p.withDouble) U(0) else ((input.format === FpuFormat.DOUBLE)   ? U(0) | U(0))
 
         when(input.valid && (input.i2f || isSubnormal) && !done){
           busy := True
           when(boot){
-            when(input.i2f && !patched && input.value.msb && input.arg(0)){
+            when(input.i2f && !patched && input.value(31) && input.arg(0)){
               input.value.getDrivingReg(0, 32 bits) := B(input.value.asUInt.twoComplement(True).resize(32 bits))
               patched := True
             } otherwise {
@@ -467,7 +495,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
 
       val recoded = p.internalFloating()
       recoded.mantissa := passThroughFloat.mantissa
-      recoded.exponent := (passThroughFloat.exponent -^ fsm.expOffset + exponentF32Subnormal).resized
+      recoded.exponent := (passThroughFloat.exponent -^ fsm.expOffset + recodedExpOffset).resized
       recoded.sign     := passThroughFloat.sign
       recoded.setNormal
       when(isZero){recoded.setZero}
@@ -480,9 +508,6 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
       output.roundMode := input.roundMode
       if(p.withDouble) {
         output.format := input.format
-        when(!input.i2f && input.format === FpuFormat.DOUBLE && input.value(63 downto 23).andR){ //Detect boxing
-          output.format := FpuFormat.FLOAT
-        }
       }
       output.rd := input.rd
       output.value.sign      := recoded.sign
@@ -523,9 +548,15 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
       val exp = (input.rs1.exponent - (exponentOne-1023)).resize(11 bits)
       val man = CombInit(input.rs1.mantissa)
     }
-    recodedResult := (if(p.withDouble) B"xFFFFFFFF" else B"") ## input.rs1.sign ## f32.exp ## f32.man
 
-    val expInSubnormalRange = input.rs1.exponent <= exponentOne - 127
+    whenDouble(input.format){
+      recodedResult := input.rs1.sign ## f64.exp ## f64.man
+    } {
+      recodedResult := (if(p.withDouble) B"xFFFFFFFF" else B"") ## input.rs1.sign ## f32.exp ## f32.man
+    }
+
+    val expSubnormalThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal)(exponentF32Subnormal)
+    val expInSubnormalRange = input.rs1.exponent <= expSubnormalThreshold
     val isSubnormal = !input.rs1.special && expInSubnormalRange
     val isNormal = !input.rs1.special && !expInSubnormalRange
     val fsm = new Area{
@@ -552,14 +583,14 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
 
       shift.input := (U(!isZero) @@ input.rs1.mantissa) << (if(p.withDouble) 0 else 9)
 
-
+      val formatShiftOffset = muxDouble[UInt](input.format)(exponentOne-1023+1)(exponentOne - (if(p.withDouble) (127+34) else (127-10)))
       when(input.valid && (needRecoding || isF2i) && !done){
         halt := True
         when(boot){
           when(isF2i){
-            shift.by := (U(exponentOne + 31) - input.rs1.exponent).min(U(33)).resized //TODO merge
+            shift.by := ((U(exponentOne + 31) - input.rs1.exponent).min(U(33)) + (if(p.withDouble) 20 else 0)).resized //TODO merge
           } otherwise {
-            shift.by := (U(exponentOne - 127+10) - input.rs1.exponent).resized
+            shift.by := (formatShiftOffset - input.rs1.exponent).resized
           }
           boot := False
         } otherwise {
@@ -619,7 +650,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
     when(mantissaForced){
       recodedResult(0,23 bits) := (default -> mantissaForcedValue)
       whenDouble(input.format){
-        recodedResult(52-23, 52-23 bits) := (default -> exponentForcedValue)
+        recodedResult(23, 52-23 bits) := (default -> mantissaForcedValue)
       }{}
     }
     when(exponentForced){
@@ -764,10 +795,6 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
     }
 
     val norm = new Area{
-//      val needShift = math.mulC.msb
-//      val exp = math.exp + U(needShift)
-//      val man = needShift ? math.mulC(p.internalMantissaSize + 1, p.internalMantissaSize bits) | math.mulC(p.internalMantissaSize, p.internalMantissaSize bits)
-
       val (mulHigh, mulLow) = math.mulC.splitAt(p.internalMantissaSize-1)
       val scrap = mulLow =/= 0
       val needShift = mulHigh.msb
@@ -775,7 +802,9 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
       val man = needShift ? mulHigh(1, p.internalMantissaSize+1 bits) | mulHigh(0, p.internalMantissaSize+1 bits)
       scrap setWhen(needShift && mulHigh(0))
       val forceZero = input.rs1.isZero || input.rs2.isZero
-      val forceUnderflow = exp <  exponentOne + exponentOne - 127 - 24  // 0x6A //TODO
+      val underflowThreshold = muxDouble[UInt](input.format)(exponentOne + exponentOne - 1023 - 53) (exponentOne + exponentOne - 127 - 24)
+      val underflowExp = muxDouble[UInt](input.format)(exponentOne - 1023 - 54) (exponentOne - 127 - 25)
+      val forceUnderflow = exp <  underflowThreshold
       val forceOverflow = input.rs1.isInfinity || input.rs2.isInfinity
       val infinitynan = ((input.rs1.isInfinity || input.rs2.isInfinity) && (input.rs1.isZero || input.rs2.isZero))
       val forceNan = input.rs1.isNan || input.rs2.isNan || infinitynan
@@ -797,7 +826,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
       } elsewhen(forceZero) {
         output.setZero
       } elsewhen(forceUnderflow) {
-        output.exponent := exponentOne - 127 - 25
+        output.exponent := underflowExp.resized
       }
 
     }
@@ -1123,11 +1152,14 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
   val round = new Area{
     val input = merge.commited.combStage
 
-    //TODO do not break NAN payload (seems already fine)
     val manAggregate = input.value.mantissa @@ input.scrap
-    val expDif = (exponentOne-126) -^ input.value.exponent
+    val expBase = muxDouble[UInt](input.format)(exponentF64Subnormal+1)(exponentF32Subnormal+1)
+    val expDif = expBase -^ input.value.exponent
     val expSubnormal = !expDif.msb
-    val discardCount = expSubnormal ? expDif.resize(log2Up(p.internalMantissaSize) bits) |  U(0)
+    var discardCount = (expSubnormal ? expDif.resize(log2Up(p.internalMantissaSize) bits) |  U(0))
+    if(p.withDouble) when(input.format === FpuFormat.FLOAT){
+      discardCount \= discardCount + 29
+    }
     val exactMask = (List(True) ++ (0 until p.internalMantissaSize+1).map(_ < discardCount)).asBits.asUInt
     val roundAdjusted = (True ## (manAggregate>>1))(discardCount) ## ((manAggregate & exactMask) =/= 0)
 
@@ -1156,10 +1188,16 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
 //      uf := True
 //    }
 
-    when(!math.special && math.exponent <= exponentOne-127 && roundAdjusted.asUInt =/= 0){ //Do not catch exact 1.17549435E-38 underflow, but, who realy care ?
+
+
+    val ufSubnormalThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal)(exponentF32Subnormal)
+    val ufThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal-52+1)(exponentF32Subnormal-23+1)
+    val ofThreshold = muxDouble[UInt](input.format)(exponentF64Infinity-1)(exponentF32Infinity-1)
+
+    when(!math.special && math.exponent <= ufSubnormalThreshold && roundAdjusted.asUInt =/= 0){ //Do not catch exact 1.17549435E-38 underflow, but, who realy care ?
       uf := True
     }
-    when(!math.special && math.exponent >= exponentOne + 128){
+    when(!math.special && math.exponent > ofThreshold){
       nx := True
       of := True
       val doMax = input.roundMode.mux(
@@ -1170,7 +1208,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
         FpuRoundMode.RMM -> (False)
       )
       when(doMax){
-        patched.exponent := exponentOne + 127
+        patched.exponent := ofThreshold
         patched.mantissa.setAll()
       } otherwise {
         patched.setInfinity
@@ -1178,7 +1216,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
     }
 
 
-    when(!math.special && math.exponent <= exponentOne - 127-23){
+    when(!math.special && math.exponent < ufThreshold){
       nx := True
       uf := True
       val doMin = input.roundMode.mux(
@@ -1189,7 +1227,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
         FpuRoundMode.RMM -> (False)
       )
       when(doMin){
-        patched.exponent := exponentOne - 127-23+1
+        patched.exponent := ufThreshold.resized
         patched.mantissa := 0
       } otherwise {
         patched.setZero
diff --git a/src/test/scala/vexriscv/ip/fpu/FpuTest.scala b/src/test/scala/vexriscv/ip/fpu/FpuTest.scala
index 6d5b495..79ea6b8 100644
--- a/src/test/scala/vexriscv/ip/fpu/FpuTest.scala
+++ b/src/test/scala/vexriscv/ip/fpu/FpuTest.scala
@@ -23,7 +23,17 @@ import scala.util.Random
 class FpuTest extends FunSuite{
 
   val b2f = lang.Float.intBitsToFloat(_)
+  val b2d = lang.Double.longBitsToDouble(_)
   val f2b = lang.Float.floatToRawIntBits(_)
+  val d2bOffset = BigInt("10000000000000000",16)
+  // Raw IEEE-754 bits of `that`, widened to a non-negative BigInt.
+  def d2b(that : Double) = {
+    val rawBits = lang.Double.doubleToRawLongBits(that)
+    val signedView = BigInt(rawBits)
+    // A negative Long means bit 63 is set: shift it up by d2bOffset.
+    if(rawBits < 0) d2bOffset + signedView
+    else signedView
+  }
 
 
   test("f32f64") {
@@ -64,24 +74,6 @@ class FpuTest extends FunSuite{
           def f32_f32_f32 ={
             val s = new Scanner(next)
             val a,b,c = (s.nextLong(16).toInt)
-//            if(b2f(a).isNaN ||  b2f(b).isNaN){
-//              print("NAN => ")
-//              if(((a >> 23) & 0xFF) == 0xFF && ((a >> 0) & 0xEFFFFF) != 0){
-//                print(a.toHexString)
-//                print(" " + f2b(b2f(a)).toHexString)
-//              }
-//              if(((b >> 23) & 0xFF) == 0xFF && ((b >> 0) & 0xEFFFFF) != 0){
-//                print(b.toHexString)
-//                print(" " + f2b(b2f(b)).toHexString)
-//              }
-//              if(((c >> 23) & 0xFF) == 0xFF && ((c >> 0) & 0xEFFFFF) != 0){
-//                print(" " + c.toHexString)
-//                print(" " + f2b(b2f(c)).toHexString)
-//              }
-//
-//              print(" " + simTime())
-//              println("")
-//            }
             (b2f(a), b2f(b), b2f(c), s.nextInt(16))
           }
 
@@ -105,8 +97,39 @@ class FpuTest extends FunSuite{
             val s = new Scanner(next)
             val a,b = (s.nextLong(16).toInt)
             (b2f(a), b2f(b), s.nextInt(16))
-        }
+          }
 
+
+          def nextLong(s : Scanner) : Long = java.lang.Long.parseUnsignedLong( s.next(),16)
+
+          def f64_f64_f64 ={
+            val s = new Scanner(next)
+            val a,b,c = nextLong(s)
+            (b2d(a), b2d(b), b2d(c), s.nextInt(16))
+          }
+
+          def i32_f64 ={
+            val s = new Scanner(next)
+            (s.nextLong(16).toInt, b2d(nextLong(s)), s.nextInt(16))
+          }
+
+          def f64_i32 = {
+            val s = new Scanner(next)
+            (b2d(nextLong(s)), s.nextLong(16).toInt, s.nextInt(16))
+          }
+
+          def f64_f64_i32 = {
+            val str = next
+            val s = new Scanner(str)
+            val a,b,c = (nextLong(s))
+            (b2d(a), b2d(b), c, s.nextInt(16))
+          }
+
+          def f64_f64 = { // Reads two hex-encoded f64 operands, then a hex Int, from the next line
+            val s = new Scanner(next)
+            val a,b = nextLong(s) // Unsigned parse: Scanner.nextLong(16) rejects tokens with bit 63 set
+            (b2d(a), b2d(b), s.nextInt(16))
+          }
         }
         lazy val RAW = build("")
         lazy val RNE = build("-rnear_even")
@@ -125,28 +148,33 @@ class FpuTest extends FunSuite{
         }
       }
 
-      val f32 = new {
-        val add = new TestCase("f32_add")
-        val sub = new TestCase("f32_sub")
-        val mul = new TestCase("f32_mul")
-        val ui2f = new TestCase("ui32_to_f32")
-        val i2f = new TestCase("i32_to_f32")
-        val f2ui = new TestCase("f32_to_ui32 -exact")
-        val f2i = new TestCase("f32_to_i32 -exact")
-        val eq = new TestCase("f32_eq")
-        val lt = new TestCase("f32_lt")
-        val le = new TestCase("f32_le")
-        val min = new TestCase("f32_le")
-        val max = new TestCase("f32_lt")
-        val transfer = new TestCase("f32_eq")
-        val fclass = new TestCase("f32_eq")
-        val sgnj = new TestCase("f32_eq")
-        val sgnjn = new TestCase("f32_eq")
-        val sgnjx = new TestCase("f32_eq")
-        val sqrt = new TestCase("f32_sqrt")
-        val div = new TestCase("f32_div")
+      class TestVector(f : String) {
+        val add = new TestCase(s"${f}_add")
+        val sub = new TestCase(s"${f}_sub")
+        val mul = new TestCase(s"${f}_mul")
+        val ui2f = new TestCase(s"ui32_to_${f}")
+        val i2f = new TestCase(s"i32_to_${f}")
+        val f2ui = new TestCase(s"${f}_to_ui32 -exact")
+        val f2i = new TestCase(s"${f}_to_i32 -exact")
+        val eq = new TestCase(s"${f}_eq")
+        val lt = new TestCase(s"${f}_lt")
+        val le = new TestCase(s"${f}_le")
+        val min = new TestCase(s"${f}_le")
+        val max = new TestCase(s"${f}_lt")
+        val transfer = new TestCase(s"${f}_eq")
+        val fclass = new TestCase(s"${f}_eq")
+        val sgnj = new TestCase(s"${f}_eq")
+        val sgnjn = new TestCase(s"${f}_eq")
+        val sgnjx = new TestCase(s"${f}_eq")
+        val sqrt = new TestCase(s"${f}_sqrt")
+        val div = new TestCase(s"${f}_div")
+        val f32 = new TestCase(s"${f}_eq")
+        val f64 = new TestCase(s"${f}_eq")
       }
 
+      val f32 = new TestVector("f32")
+      val f64 = new TestVector("f64")
+
       val cpus = for(id <- 0 until portCount) yield new {
         val cmdQueue = mutable.Queue[FpuCmd => Unit]()
         val commitQueue = mutable.Queue[FpuCommit => Unit]()
@@ -165,9 +193,15 @@ class FpuTest extends FunSuite{
           val patch = if(value.abs == 1.17549435E-38f) ref & ~2 else ref
           flagMatch(patch, report)
         }
+
+        def flagMatch(ref : Int, value : Double, report : String): Unit ={ // Double counterpart of the Float overload
+          val patch = if(value.abs == b2d(1L << 52)) ref & ~2 else ref // 1L needed: Int `1 << 52` wraps to `1 << 20`; bit 52 alone is the smallest normal double
+          flagMatch(patch, report)
+        }
+
         def flagMatch(ref : Int, report : String): Unit ={
           waitUntil(pendingMiaou == 0)
-          assert(flagAccumulator == ref, s"Flag missmatch dut=$flagAccumulator ref=$ref $report")
+          softAssert(flagAccumulator == ref, s"Flag missmatch dut=$flagAccumulator ref=$ref $report")
           flagAccumulator = 0
         }
         def flagClear(): Unit ={
@@ -231,6 +265,10 @@ class FpuTest extends FunSuite{
           loadRaw(rd, f2b(value).toLong & 0xFFFFFFFFl, FpuFormat.FLOAT)
         }
 
+        def load(rd : Int, value : Double): Unit ={
+          loadRaw(rd, d2b(value), FpuFormat.DOUBLE)
+        }
+
         def storeRaw(rs : Int, format : FpuFormat.E)(body : FpuRsp => Unit): Unit ={
           cmdAdd {cmd =>
             cmd.opcode #= cmd.opcode.spinalEnum.STORE
@@ -250,8 +288,11 @@ class FpuTest extends FunSuite{
         def storeFloat(rs : Int)(body : Float => Unit): Unit ={
           storeRaw(rs, FpuFormat.FLOAT){rsp => body(b2f(rsp.value.toBigInt.toInt))}
         }
+        def store(rs : Int)(body : Double => Unit): Unit ={
+          storeRaw(rs, FpuFormat.DOUBLE){rsp => body(b2d(rsp.value.toBigInt.toLong))}
+        }
 
-        def fpuF2f(rd : Int, rs1 : Int, rs2 : Int, rs3 : Int, opcode : FpuOpcode.E, arg : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
+        def fpuF2f(rd : Int, rs1 : Int, rs2 : Int, rs3 : Int, opcode : FpuOpcode.E, arg : Int, rounding : FpuRoundMode.E, format : FpuFormat.E): Unit ={
           cmdAdd {cmd =>
             cmd.opcode #= opcode
             cmd.rs1 #= rs1
@@ -260,6 +301,7 @@ class FpuTest extends FunSuite{
             cmd.rd #= rd
             cmd.arg #= arg
             cmd.roundMode #= rounding
+            cmd.format #= format
           }
           commitQueue += {cmd =>
             cmd.write #= true
@@ -267,7 +309,7 @@ class FpuTest extends FunSuite{
           }
         }
 
-        def fpuF2i(rs1 : Int, rs2 : Int, opcode : FpuOpcode.E, arg : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE)(body : FpuRsp => Unit): Unit ={
+        def fpuF2i(rs1 : Int, rs2 : Int, opcode : FpuOpcode.E, arg : Int, rounding : FpuRoundMode.E, format : FpuFormat.E)(body : FpuRsp => Unit): Unit ={
           cmdAdd {cmd =>
             cmd.opcode #= opcode
             cmd.rs1 #= rs1
@@ -276,58 +318,59 @@ class FpuTest extends FunSuite{
             cmd.rd.randomize()
             cmd.arg #= arg
             cmd.roundMode #= rounding
+            cmd.format #= format
           }
           rspQueue += body
         }
 
 
-        def mul(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
-          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.MUL, 0, rounding)
+        def mul(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E, format : FpuFormat.E): Unit ={
+          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.MUL, 0, rounding, format)
         }
 
-        def add(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
-          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.ADD, 0, rounding)
+        def add(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE, format : FpuFormat.E): Unit ={
+          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.ADD, 0, rounding, format)
         }
 
-        def sub(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
-          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.ADD, 1, rounding)
+        def sub(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE, format : FpuFormat.E): Unit ={
+          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.ADD, 1, rounding, format)
         }
 
-        def div(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
-          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.DIV, Random.nextInt(4), rounding)
+        def div(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE, format : FpuFormat.E): Unit ={
+          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.DIV, Random.nextInt(4), rounding, format)
         }
 
-        def sqrt(rd : Int, rs1 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
-          fpuF2f(rd, rs1, Random.nextInt(32), Random.nextInt(32), FpuOpcode.SQRT, Random.nextInt(4), rounding)
+        def sqrt(rd : Int, rs1 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE, format : FpuFormat.E): Unit ={
+          fpuF2f(rd, rs1, Random.nextInt(32), Random.nextInt(32), FpuOpcode.SQRT, Random.nextInt(4), rounding, format)
         }
 
-        def fma(rd : Int, rs1 : Int, rs2 : Int, rs3 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
-          fpuF2f(rd, rs1, rs2, rs3, FpuOpcode.FMA, 0, rounding)
+        def fma(rd : Int, rs1 : Int, rs2 : Int, rs3 : Int, rounding : FpuRoundMode.E, format : FpuFormat.E): Unit ={
+          fpuF2f(rd, rs1, rs2, rs3, FpuOpcode.FMA, 0, rounding, format)
         }
 
-        def sgnjRaw(rd : Int, rs1 : Int, rs2 : Int, arg : Int): Unit ={
-          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.SGNJ, arg, FpuRoundMode.elements.randomPick())
+        def sgnjRaw(rd : Int, rs1 : Int, rs2 : Int, arg : Int, format : FpuFormat.E): Unit ={
+          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.SGNJ, arg, FpuRoundMode.elements.randomPick(), format)
         }
 
-        def sgnj(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = null): Unit ={
-          sgnjRaw(rd, rs1, rs2, 0)
+        def sgnj(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = null, format : FpuFormat.E): Unit ={
+          sgnjRaw(rd, rs1, rs2, 0, format)
         }
-        def sgnjn(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = null): Unit ={
-          sgnjRaw(rd, rs1, rs2, 1)
+        def sgnjn(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = null, format : FpuFormat.E): Unit ={
+          sgnjRaw(rd, rs1, rs2, 1, format)
         }
-        def sgnjx(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = null): Unit ={
-          sgnjRaw(rd, rs1, rs2, 2)
+        def sgnjx(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = null, format : FpuFormat.E): Unit ={
+          sgnjRaw(rd, rs1, rs2, 2, format)
         }
 
-        def cmp(rs1 : Int, rs2 : Int, arg : Int = 1)(body : FpuRsp => Unit): Unit ={
-          fpuF2i(rs1, rs2, FpuOpcode.CMP, arg, FpuRoundMode.elements.randomPick())(body)
+        def cmp(rs1 : Int, rs2 : Int, arg : Int, format : FpuFormat.E)(body : FpuRsp => Unit): Unit ={
+          fpuF2i(rs1, rs2, FpuOpcode.CMP, arg, FpuRoundMode.elements.randomPick(), format)(body)
         }
 
-        def f2i(rs1 : Int, signed : Boolean, rounding : FpuRoundMode.E = FpuRoundMode.RNE)(body : FpuRsp => Unit): Unit ={
-          fpuF2i(rs1, Random.nextInt(32), FpuOpcode.F2I, if(signed) 1 else 0, rounding)(body)
+        def f2i(rs1 : Int, signed : Boolean, rounding : FpuRoundMode.E, format : FpuFormat.E)(body : FpuRsp => Unit): Unit ={
+          fpuF2i(rs1, Random.nextInt(32), FpuOpcode.F2I, if(signed) 1 else 0, rounding, format)(body)
         }
 
-        def i2f(rd : Int, value : Int, signed : Boolean, rounding : FpuRoundMode.E): Unit ={
+        def i2f(rd : Int, value : Int, signed : Boolean, rounding : FpuRoundMode.E, format : FpuFormat.E): Unit ={
           cmdAdd {cmd =>
             cmd.opcode #= cmd.opcode.spinalEnum.I2F
             cmd.rs1.randomize()
@@ -336,6 +379,7 @@ class FpuTest extends FunSuite{
             cmd.rd #= rd
             cmd.arg #= (if(signed) 1 else 0)
             cmd.roundMode #= rounding
+            cmd.format #= format
           }
           commitQueue += {cmd =>
             cmd.write #= true
@@ -451,13 +495,13 @@ class FpuTest extends FunSuite{
         }
 
 
-        def testBinaryOp(op : (Int,Int,Int,FpuRoundMode.E) => Unit, a : Float, b : Float, ref : Float, flag : Int, rounding : FpuRoundMode.E, opName : String): Unit ={
+        def testBinaryOp(op : (Int,Int,Int,FpuRoundMode.E, FpuFormat.E) => Unit, a : Float, b : Float, ref : Float, flag : Int, rounding : FpuRoundMode.E, opName : String): Unit ={
           val rs = new RegAllocator()
           val rs1, rs2, rs3 = rs.allocate()
           val rd = Random.nextInt(32)
           load(rs1, a)
           load(rs2, b)
-          op(rd,rs1,rs2, rounding)
+          op(rd,rs1,rs2, rounding, FpuFormat.FLOAT)
           storeFloat(rd){v =>
             assert(f2b(v) == f2b(ref), f"## ${a}  ${opName}  $b = $v, $ref $rounding")
           }
@@ -466,12 +510,25 @@ class FpuTest extends FunSuite{
         }
 
 
+        def testBinaryOpF64(op : (Int,Int,Int,FpuRoundMode.E, FpuFormat.E) => Unit, a : Double, b : Double, ref : Double, flag : Int, rounding : FpuRoundMode.E, opName : String): Unit ={
+          val rs = new RegAllocator()
+          val rs1, rs2, rs3 = rs.allocate()
+          val rd = Random.nextInt(32)
+          load(rs1, a)
+          load(rs2, b)
+          op(rd,rs1,rs2, rounding, FpuFormat.DOUBLE)
+          store(rd){v =>
+            assert(d2b(v) == d2b(ref), f"## ${a}  ${opName}  $b = $v, $ref $rounding")
+          }
 
-        def testTransferRaw(a : Float, iSrc : Boolean, iDst : Boolean): Unit ={
+          flagMatch(flag, ref, f"## ${opName} ${a} $b $ref $rounding")
+        }
+
+
+        def testTransferF32Raw(a : Float, iSrc : Boolean, iDst : Boolean): Unit ={
           val rd = Random.nextInt(32)
 
           def handle(v : Float): Unit ={
-            val refUnclamped = a
             val ref = a
             assert(f2b(v) == f2b(ref), f"$a = $v, $ref")
           }
@@ -482,6 +539,49 @@ class FpuTest extends FunSuite{
           flagMatch(0, f"$a")
         }
 
+
+        def testTransferF64Raw(a : Double): Unit ={
+          val rd = Random.nextInt(32)
+
+          def handle(v : Double): Unit ={
+            val ref = a
+            assert(d2b(v) == d2b(ref), f"$a = $v, $ref")
+          }
+
+          load(rd, a)
+          store(rd)(handle)
+
+          flagMatch(0, f"$a")
+        }
+
+        def testTransferF32F64Raw(a : Float,  iSrc : Boolean): Unit ={
+          val rd = Random.nextInt(32)
+          if(iSrc) fmv_w_x(rd, f2b(a)) else load(rd, a)
+          storeRaw(rd, FpuFormat.DOUBLE){rsp =>
+            val v = rsp.value.toBigInt.toLong
+            val ref = (0xFFFFFFFFl << 32) | f2b(a)
+            assert(v == ref, f"$a = $v, $ref")
+          }
+          flagMatch(0, f"$a")
+        }
+
+        def testTransferF64F32Raw(a : Double,  iDst : Boolean): Unit ={
+          val rd = Random.nextInt(32)
+          load(rd, a)
+          if(iDst)fmv_x_w(rd){v_ =>
+            val v = f2b(v_).toLong & 0xFFFFFFFFl
+            val ref = d2b(a) & 0xFFFFFFFFl
+            assert(v == ref, f"$a = $v, $ref")
+          }
+          else storeRaw(rd, FpuFormat.FLOAT){rsp =>
+            val v = rsp.value.toBigInt.toLong & 0xFFFFFFFFl
+            val ref = d2b(a) & 0xFFFFFFFFl
+            assert(v == ref, f"$a = $v, $ref")
+          }
+          flagMatch(0, f"$a")
+        }
+
+
         def testClassRaw(a : Float) : Unit = {
           val rd = Random.nextInt(32)
 
@@ -513,7 +613,7 @@ class FpuTest extends FunSuite{
           load(rs2, b)
           load(rs3, c)
 
-          fma(rd,rs1,rs2,rs3)
+          fma(rd,rs1,rs2,rs3, FpuRoundMode.RNE, FpuFormat.FLOAT)
           storeFloat(rd){v =>
             val ref = a.toDouble * b.toDouble + c.toDouble
             println(f"$a%.20f * $b%.20f + $c%.20f = $v%.20f, $ref%.20f")
@@ -530,7 +630,7 @@ class FpuTest extends FunSuite{
           load(rs1, a)
           load(rs2, b)
 
-          div(rd,rs1,rs2)
+          div(rd,rs1,rs2, FpuRoundMode.RNE, FpuFormat.FLOAT)
           storeFloat(rd){v =>
             val refUnclamped = a/b
             val refClamped = ((a)/(b))
@@ -547,7 +647,7 @@ class FpuTest extends FunSuite{
           val rd = Random.nextInt(32)
           load(rs1, a)
 
-          sqrt(rd,rs1)
+          sqrt(rd,rs1, FpuRoundMode.RNE, FpuFormat.FLOAT)
           storeFloat(rd){v =>
             val ref = Math.sqrt(a).toFloat
             val error = Math.abs(ref-v)/ref
@@ -564,7 +664,7 @@ class FpuTest extends FunSuite{
           val rd = Random.nextInt(32)
           load(rs1, a)
 
-          sqrt(rd,rs1)
+          sqrt(rd,rs1,  FpuRoundMode.RNE, FpuFormat.FLOAT)
           storeFloat(rd){v =>
             val error = Math.abs(ref-v)/ref
             println(f"sqrt($a) = $v, $ref $error $rounding")
@@ -579,7 +679,7 @@ class FpuTest extends FunSuite{
           load(rs1, a)
           load(rs2, b)
 
-          div(rd,rs1, rs2)
+          div(rd,rs1, rs2, FpuRoundMode.RNE, FpuFormat.FLOAT)
           storeFloat(rd){v =>
             val error = Math.abs(ref-v)/ref
             println(f"div($a, $b) = $v, $ref $error $rounding")
@@ -594,16 +694,16 @@ class FpuTest extends FunSuite{
           val rs1 = rs.allocate()
           val rd = Random.nextInt(32)
           load(rs1, a)
-          f2i(rs1, signed, rounding){rsp =>
+          f2i(rs1, signed, rounding, FpuFormat.FLOAT){rsp =>
             if(signed) {
-              val v = rsp.value.toLong.toInt
+              val v = rsp.value.toBigInt.toInt
               var ref2 = ref
               if(a >= Int.MaxValue) ref2 = Int.MaxValue
               if(a <= Int.MinValue) ref2 = Int.MinValue
               if(a.isNaN) ref2 = Int.MaxValue
               assert(v == (ref2), f" <= f2i($a) = $v, $ref2, $rounding, $flag")
             } else {
-              val v = rsp.value.toLong
+              val v = rsp.value.toBigInt.toLong & 0xFFFFFFFFl
               var ref2 = ref.toLong & 0xFFFFFFFFl
               if(a < 0) ref2 = 0
               if(a >= 0xFFFFFFFFl) ref2 = 0xFFFFFFFFl
@@ -621,15 +721,15 @@ class FpuTest extends FunSuite{
         def testI2fExact(a : Int, b : Float, f : Int, signed : Boolean, rounding : FpuRoundMode.E): Unit ={
           val rs = new RegAllocator()
           val rd = Random.nextInt(32)
-          i2f(rd, a, signed, rounding)
+          i2f(rd, a, signed, rounding, FpuFormat.FLOAT)
           storeFloat(rd){v =>
             val aLong = if(signed) a.toLong else a.toLong & 0xFFFFFFFFl
             val ref = b
-            assert(f2b(v) == f2b(ref), f"i2f($aLong) = $v, $ref")
+            assert(f2b(v) == f2b(ref), f"i2f($aLong) = $v, $ref $rounding")
           }
 
 
-          flagMatch(f, b, f"i2f() = $b")
+          flagMatch(f, b, f"i2f($a) = $b")
         }
 
 
@@ -640,7 +740,7 @@ class FpuTest extends FunSuite{
           val rd = Random.nextInt(32)
           load(rs1, a)
           load(rs2, b)
-          cmp(rs1, rs2, arg){rsp =>
+          cmp(rs1, rs2, arg, FpuFormat.FLOAT){rsp =>
             val v = rsp.value.toLong
             assert(v === ref, f"cmp($a, $b, $arg) = $v, $ref")
           }
@@ -744,29 +844,6 @@ class FpuTest extends FunSuite{
           }
         }
 
-
-
-//        for(i <- 0 until 64){
-//          val rounding = FpuRoundMode.RMM
-//          val a = 24f
-//          val b = b2f(0x3f800000+i)
-//          val c = Clib.math.mulF32(a, b, rounding.position)
-//          val f = 0
-//          testMulExact(a,b,c,f, rounding)
-//        }
-
-        val binaryOps = List[(Int,Int,Int,FpuRoundMode.E) => Unit](add, sub, mul)
-
-//        testSqrt(0.0f)
-        //        testSqrt(1.2f)
-        //        for(a <- fAll) testSqrt(a)
-//        for(_ <- 0 until 1000) testSqrt(randomFloat())
-
-
-
-
-
-
         def testFma() : Unit = {
           testFmaRaw(randomFloat(), randomFloat(), randomFloat())
           flagClear()
@@ -786,13 +863,13 @@ class FpuTest extends FunSuite{
           testEqRaw(a,b,i, f)
         }
 
-        def testF2ui() : Unit = {
+        def testF2uiF32() : Unit = {
           val rounding = FpuRoundMode.elements.randomPick()
           val (a,b,f) = f32.f2ui(rounding).f32_i32
           testF2iExact(a,b, f, false, rounding)
         }
 
-        def testF2i() : Unit = {
+        def testF2iF32() : Unit = {
           val rounding = FpuRoundMode.elements.randomPick()
           val (a,b,f) = f32.f2i(rounding).f32_i32
           testF2iExact(a,b, f, true, rounding)
@@ -823,11 +900,26 @@ class FpuTest extends FunSuite{
           testSgnjxRaw(a, b)
         }
 
-        def testTransfer() : Unit = {
+        def testTransferF32() : Unit = {
           val (a,b,r,f) = f32.transfer.RAW.f32_f32_i32
-          testTransferRaw(a, Random.nextBoolean(), Random.nextBoolean())
+          testTransferF32Raw(a, Random.nextBoolean(), Random.nextBoolean())
         }
 
+        def testTransferF64() : Unit = { // runs testTransferF64Raw on the first field of an f64.transfer tuple; the other fields are unused
+          val (value, _, _, _) = f64.transfer.RAW.f64_f64_i32
+          testTransferF64Raw(value)
+        }
+
+        def testTransferF64F32() : Unit = { // f64 value read back through an f32 path, with the read path picked at random
+          val (value, _, _, _) = f64.f32.RAW.f64_f64_i32
+          testTransferF64F32Raw(value, Random.nextBoolean())
+        }
+        def testTransferF32F64() : Unit = { // f32 value read back through an f64 store, with the write path picked at random
+          val (value, _, _, _) = f32.f64.RAW.f32_f32_i32
+          testTransferF32F64Raw(value, Random.nextBoolean())
+        }
+
+
         def testClass() : Unit = {
           val (a,b,r,f) = f32.fclass.RAW.f32_f32_i32
           testClassRaw(a)
@@ -854,59 +946,112 @@ class FpuTest extends FunSuite{
           testI2fExact(a,b,f, false, rounding)
         }
 
-        def testMul() : Unit = {
+        def testMulF32() : Unit = {
           val rounding = FpuRoundMode.elements.randomPick()
           val (a,b,c,f) = f32.mul(rounding).f32_f32_f32
           testBinaryOp(mul,a,b,c,f, rounding,"mul")
         }
 
-        def testAdd() : Unit = {
+        def testAddF32() : Unit = {
           val rounding = FpuRoundMode.elements.randomPick()
           val (a,b,c,f) = f32.add(rounding).f32_f32_f32
           testBinaryOp(add,a,b,c,f, rounding,"add")
         }
 
-        def testSub() : Unit = {
+        def testSubF32() : Unit = {
           val rounding = FpuRoundMode.elements.randomPick()
           val (a,b,c,f) = f32.sub(rounding).f32_f32_f32
           testBinaryOp(sub,a,b,c,f, rounding,"sub")
         }
 
+        def testMulF64() : Unit = { // one f64 multiply check under a randomly picked rounding mode
+          val mode = FpuRoundMode.elements.randomPick()
+          val (lhs, rhs, expected, flags) = f64.mul(mode).f64_f64_f64
+          testBinaryOpF64(mul, lhs, rhs, expected, flags, mode, "mul")
+        }
 
-        val f32Tests = List[() => Unit](testSub, testAdd, testMul, testI2f, testUI2f, testMin, testMax, testSgnj, testTransfer, testDiv, testSqrt, testF2i, testF2ui, testLe, testEq, testLt, testClass, testFma)
+
+        val f32Tests = List[() => Unit](testSubF32, testAddF32, testMulF32, testI2f, testUI2f, testMin, testMax, testSgnj, testTransferF32, testDiv, testSqrt, testF2iF32, testF2uiF32, testLe, testEq, testLt, testClass, testFma)
+
+        //TODO test boxing
+        if(p.withDouble) {
+//          for(_ <- 0 until 10000) testUI2f64()
+//          for(_ <- 0 until 10000) testI2f64()
+//          println("f64 i2f done")
+//
+//          for(_ <- 0 until 10000) testF2uiF64()
+//          for(_ <- 0 until 10000) testF2iF64()
+//          println("f64 f2i done")
+
+//          testF2iExact(1.0f,1, 0, false, FpuRoundMode.RTZ)
+//          testF2iExact(2.0f,2, 0, false, FpuRoundMode.RTZ)
+//          testF2iExact(2.5f,2, 1, false, FpuRoundMode.RTZ)
 
 
 
 
+          testBinaryOpF64(mul,1.0, 1.0, 1.0,0 , FpuRoundMode.RNE,"mul")
+          testBinaryOpF64(mul,1.0, 2.0, 2.0,0 , FpuRoundMode.RNE,"mul")
+          testBinaryOpF64(mul,2.5, 2.0, 5.0,0 , FpuRoundMode.RNE,"mul")
 
-        testTransferRaw(1.0f, false, false)
-        testTransferRaw(2.0f, false, false)
-        testTransferRaw(2.5f, false, false)
-        testTransferRaw(6.97949770801e-39f, false, false)
-        testTransferRaw(8.72437213501e-40f, false, false)
-        testTransferRaw(5.6E-45f, false, false)
+          for(_ <- 0 until 10000) testMulF64()
+          println("f64 Mul done")
 
+          testTransferF64Raw(1.0)
+          testTransferF64Raw(2.0)
+          testTransferF64Raw(2.5)
+          testTransferF64Raw(6.97949770801e-39)
+          testTransferF64Raw(8.72437213501e-40)
+          testTransferF64Raw(5.6E-45)
 
+          testTransferF32F64Raw(b2f(0xFFFF1234), false) // iSrc = false: the f32 is written with load
+          testTransferF64F32Raw(b2d(0xFFF123498765463l << 4), false) // iDst = false: read back with a FLOAT-format storeRaw
+          testTransferF32F64Raw(b2f(0xFFFF1234), true) // iSrc = true: the f32 is written with fmv_w_x
+          testTransferF64F32Raw(b2d(0xFFF123498765463l << 4), true) // iDst = true: read back with fmv_x_w
 
+          for (_ <- 0 until 10000) testTransferF64()
+          println("f64 load/store/rf transfer done")
 
+          for (_ <- 0 until 10000) testTransferF64F32()
+          println("f64 -> f32 load/store/rf transfer done")
 
-        for(_ <- 0 until 10000) testTransfer()
+          for (_ <- 0 until 10000) testTransferF32F64()
+          println("f32 -> f64 load/store/rf transfer done")
+
+        }
+
+        for(_ <- 0 until 10000) testTransferF32()
         println("f32 load/store/rf transfer done")
 
-        for(_ <- 0 until 10000) testF2ui()
-        for(_ <- 0 until 10000) testF2i()
-        println("f2i done")
+        for(_ <- 0 until 10000) testMulF32()
+        println("Mul done")
+
 
         for(_ <- 0 until 10000) testUI2f()
         for(_ <- 0 until 10000) testI2f()
         println("i2f done")
 
 
+        testF2iExact(1.0f,1, 0, false, FpuRoundMode.RTZ)
+        testF2iExact(2.0f,2, 0, false, FpuRoundMode.RTZ)
+        testF2iExact(2.5f,2, 1, false, FpuRoundMode.RTZ)
+
+
+
+
+
+        for(_ <- 0 until 10000) testF2uiF32()
+        for(_ <- 0 until 10000) testF2iF32()
+        println("f2i done")
+
+
+
 //        waitUntil(cmdQueue.isEmpty)
 //        dut.clockDomain.waitSampling(1000)
 //        simSuccess()
 
 
+
         for(i <- 0 until 1000) testFma()
         flagClear()
         println("fma done") //TODO
@@ -959,14 +1104,11 @@ class FpuTest extends FunSuite{
 
 
 
-        for(_ <- 0 until 10000) testMul()
-
-        println("Mul done")
 
 
 
-        for(_ <- 0 until 10000) testAdd()
-        for(_ <- 0 until 10000) testSub()
+        for(_ <- 0 until 10000) testAddF32()
+        for(_ <- 0 until 10000) testSubF32()
 
         println("Add done")