FPU: f64 load/store/mv/mul seem OK

2025-01-03 03:43:39 -05:00 · 2021-02-11 16:07:47 +01:00 · 2021-02-11 16:07:47 +01:00 · b6eda1ad7a
commit b6eda1ad7a
parent e97c2de837
2 changed files with 346 additions and 166 deletions
--- a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
+++ b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
@ -21,6 +21,8 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
  val exponentOne = (1 << p.internalExponentSize-1) - 1
  val exponentF32Subnormal = exponentOne-127
  val exponentF64Subnormal = exponentOne-1023
  val exponentF32Infinity = exponentOne+127+1
  val exponentF64Infinity = exponentOne+1023+1
  val rfLockCount = 5
  val lockIdType = HardType(UInt(log2Up(rfLockCount) bits))
@ -30,6 +32,11 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
    if(!p.withDouble) no
  }
  /**
   * Picks between two values according to the operand format.
   *
   * When the core is built without double support (`p.withDouble` is false) `no` is
   * returned directly and `format` is never read. Otherwise the result is the
   * `?` / `|` selection of `yes` when `format` equals FpuFormat.DOUBLE, else `no`.
   */
  def muxDouble[T <: Data](format : FpuFormat.C)(yes : => T)(no : => T): T ={
    p.withDouble match {
      case true  => (format === FpuFormat.DOUBLE) ? { yes } | { no }
      case false => no
    }
  }
  case class RfReadInput() extends Bundle{
    val source = Source()
    val opcode = p.Opcode()
@ -254,11 +261,16 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
    output.rs3 := rs3Entry.value
    if(p.withDouble){
      output.format := s1.format
-      when(s1.format === FpuFormat.FLOAT =/= rs1Entry.boxed){
+      val store = s1.opcode === FpuOpcode.STORE ||s1.opcode === FpuOpcode.FMV_X_W
      when(store){ //Pass through
        output.format := rs1Entry.boxed ? FpuFormat.FLOAT | FpuFormat.DOUBLE
      } elsewhen(s1.format === FpuFormat.FLOAT =/= rs1Entry.boxed){
        output.rs1.setNanQuiet
        output.rs1.sign := False
      }
      when(s1.format === FpuFormat.FLOAT =/= rs2Entry.boxed){
        output.rs2.setNanQuiet
        output.rs2.sign := False
      }
      when(s1.format === FpuFormat.FLOAT =/= rs3Entry.boxed){
        output.rs3.setNanQuiet
@ -364,7 +376,13 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
      output.i2f := input.i2f
      output.arg := input.arg
      output.roundMode := input.roundMode
-      if(p.withDouble) output.format := input.format
+      if(p.withDouble) {
        output.format := input.format
        when(!input.i2f && input.format === FpuFormat.DOUBLE && output.value(63 downto 32).andR){ //Detect boxing
          output.format := FpuFormat.FLOAT
        }
      }
    }
    val s1 = new Area{
@ -378,25 +396,34 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
      }
      val f64 = p.withDouble generate new Area{
        val mantissa = input.value(0, 52 bits).asUInt
-        val exponent = input.value(11, 52 bits).asUInt
+        val exponent = input.value(52, 11 bits).asUInt
        val sign     = input.value(63)
      }
      val recodedExpOffset = UInt(p.internalExponentSize bits)
      val passThroughFloat = p.internalFloating()
      passThroughFloat.special := False
-      passThroughFloat.sign := f32.sign
+
-      passThroughFloat.exponent := f32.exponent.resized
+      whenDouble(input.format){
      passThroughFloat.mantissa := f32.mantissa << (if(p.withDouble) 29 else 0)
      if(p.withDouble) when(input.format === FpuFormat.DOUBLE){
        passThroughFloat.sign := f64.sign
        passThroughFloat.exponent := f64.exponent.resized
        passThroughFloat.mantissa := f64.mantissa
        recodedExpOffset := exponentF64Subnormal
      } {
        passThroughFloat.sign := f32.sign
        passThroughFloat.exponent := f32.exponent.resized
        passThroughFloat.mantissa := f32.mantissa << (if (p.withDouble) 29 else 0)
        recodedExpOffset := exponentF32Subnormal
      }
      val manZero = passThroughFloat.mantissa === 0
      val expZero = passThroughFloat.exponent === 0
      val expOne =  passThroughFloat.exponent(7 downto 0).andR
-      if(p.withDouble) expOne.clearWhen(input.format === FpuFormat.DOUBLE && !passThroughFloat.exponent(11 downto 8).andR)
+      if(p.withDouble) {
        expZero.clearWhen(input.format === FpuFormat.DOUBLE && input.value(62 downto 60) =/= 0)
        expOne.clearWhen(input.format === FpuFormat.DOUBLE && input.value(62 downto 60) =/= 7)
      }
      val isZero      =  expZero &&  manZero
      val isSubnormal =  expZero && !manZero
@ -409,9 +436,10 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
        val ohInputWidth = 32 max p.internalMantissaSize
        val ohInput = Bits(ohInputWidth bits).assignDontCare()
        when(!input.i2f) {
-          if(!p.withDouble) ohInput(ohInputWidth-23, 23 bits) := input.value(0, 23 bits)
+          if(!p.withDouble) ohInput := input.value(0, 23 bits) << 9
          if( p.withDouble) ohInput := passThroughFloat.mantissa.asBits
        } otherwise {
          ohInput(ohInputWidth-32-1 downto 0) := 0
          ohInput(ohInputWidth-32, 32 bits) := input.value(31 downto 0)
        }
@ -426,15 +454,15 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
          }
          val output = RegNextWhen(logic, !done)
        }
-        shift.input := (input.value.asUInt |<< 1).resized
+        shift.input := (ohInput.asUInt |<< 1).resized
-        val subnormalShiftOffset = if(!p.withDouble) U(9) else ((input.format === FpuFormat.DOUBLE) ? U(0) | U(0))
+        val subnormalShiftOffset = if(!p.withDouble) U(0) else ((input.format === FpuFormat.DOUBLE) ? U(0) | U(0)) //TODO remove ?
-        val subnormalExpOffset = if(!p.withDouble) U(9) else ((input.format === FpuFormat.DOUBLE)   ? U(0) | U(0))
+        val subnormalExpOffset = if(!p.withDouble) U(0) else ((input.format === FpuFormat.DOUBLE)   ? U(0) | U(0))
        when(input.valid && (input.i2f || isSubnormal) && !done){
          busy := True
          when(boot){
-            when(input.i2f && !patched && input.value.msb && input.arg(0)){
+            when(input.i2f && !patched && input.value(31) && input.arg(0)){
              input.value.getDrivingReg(0, 32 bits) := B(input.value.asUInt.twoComplement(True).resize(32 bits))
              patched := True
            } otherwise {
@ -467,7 +495,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
      val recoded = p.internalFloating()
      recoded.mantissa := passThroughFloat.mantissa
-      recoded.exponent := (passThroughFloat.exponent -^ fsm.expOffset + exponentF32Subnormal).resized
+      recoded.exponent := (passThroughFloat.exponent -^ fsm.expOffset + recodedExpOffset).resized
      recoded.sign     := passThroughFloat.sign
      recoded.setNormal
      when(isZero){recoded.setZero}
@ -480,9 +508,6 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
      output.roundMode := input.roundMode
      if(p.withDouble) {
        output.format := input.format
        when(!input.i2f && input.format === FpuFormat.DOUBLE && input.value(63 downto 23).andR){ //Detect boxing
          output.format := FpuFormat.FLOAT
        }
      }
      output.rd := input.rd
      output.value.sign      := recoded.sign
@ -523,9 +548,15 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
      val exp = (input.rs1.exponent - (exponentOne-1023)).resize(11 bits)
      val man = CombInit(input.rs1.mantissa)
    }
    recodedResult := (if(p.withDouble) B"xFFFFFFFF" else B"") ## input.rs1.sign ## f32.exp ## f32.man
-    val expInSubnormalRange = input.rs1.exponent <= exponentOne - 127
+    whenDouble(input.format){
      recodedResult := input.rs1.sign ## f64.exp ## f64.man
    } {
      recodedResult := (if(p.withDouble) B"xFFFFFFFF" else B"") ## input.rs1.sign ## f32.exp ## f32.man
    }
    val expSubnormalThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal)(exponentF32Subnormal)
    val expInSubnormalRange = input.rs1.exponent <= expSubnormalThreshold
    val isSubnormal = !input.rs1.special && expInSubnormalRange
    val isNormal = !input.rs1.special && !expInSubnormalRange
    val fsm = new Area{
@ -552,14 +583,14 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
      shift.input := (U(!isZero) @@ input.rs1.mantissa) << (if(p.withDouble) 0 else 9)
-
+      val formatShiftOffset = muxDouble[UInt](input.format)(exponentOne-1023+1)(exponentOne - (if(p.withDouble) (127+34) else (127-10)))
      when(input.valid && (needRecoding || isF2i) && !done){
        halt := True
        when(boot){
          when(isF2i){
-            shift.by := (U(exponentOne + 31) - input.rs1.exponent).min(U(33)).resized //TODO merge
+            shift.by := ((U(exponentOne + 31) - input.rs1.exponent).min(U(33)) + (if(p.withDouble) 20 else 0)).resized //TODO merge
          } otherwise {
-            shift.by := (U(exponentOne - 127+10) - input.rs1.exponent).resized
+            shift.by := (formatShiftOffset - input.rs1.exponent).resized
          }
          boot := False
        } otherwise {
@ -619,7 +650,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
    when(mantissaForced){
      recodedResult(0,23 bits) := (default -> mantissaForcedValue)
      whenDouble(input.format){
-        recodedResult(52-23, 52-23 bits) := (default -> exponentForcedValue)
+        recodedResult(23, 52-23 bits) := (default -> mantissaForcedValue)
      }{}
    }
    when(exponentForced){
@ -764,10 +795,6 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
    }
    val norm = new Area{
 //      val needShift = math.mulC.msb
 //      val exp = math.exp + U(needShift)
 //      val man = needShift ? math.mulC(p.internalMantissaSize + 1, p.internalMantissaSize bits) | math.mulC(p.internalMantissaSize, p.internalMantissaSize bits)
      val (mulHigh, mulLow) = math.mulC.splitAt(p.internalMantissaSize-1)
      val scrap = mulLow =/= 0
      val needShift = mulHigh.msb
@ -775,7 +802,9 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
      val man = needShift ? mulHigh(1, p.internalMantissaSize+1 bits) | mulHigh(0, p.internalMantissaSize+1 bits)
      scrap setWhen(needShift && mulHigh(0))
      val forceZero = input.rs1.isZero || input.rs2.isZero
-      val forceUnderflow = exp <  exponentOne + exponentOne - 127 - 24  // 0x6A //TODO
+      val underflowThreshold = muxDouble[UInt](input.format)(exponentOne + exponentOne - 1023 - 53) (exponentOne + exponentOne - 127 - 24)
      val underflowExp = muxDouble[UInt](input.format)(exponentOne - 1023 - 54) (exponentOne - 127 - 25)
      val forceUnderflow = exp <  underflowThreshold
      val forceOverflow = input.rs1.isInfinity || input.rs2.isInfinity
      val infinitynan = ((input.rs1.isInfinity || input.rs2.isInfinity) && (input.rs1.isZero || input.rs2.isZero))
      val forceNan = input.rs1.isNan || input.rs2.isNan || infinitynan
@ -797,7 +826,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
      } elsewhen(forceZero) {
        output.setZero
      } elsewhen(forceUnderflow) {
-        output.exponent := exponentOne - 127 - 25
+        output.exponent := underflowExp.resized
      }
    }
@ -1123,11 +1152,14 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
  val round = new Area{
    val input = merge.commited.combStage
    //TODO do not break NAN payload (seems already fine)
    val manAggregate = input.value.mantissa @@ input.scrap
-    val expDif = (exponentOne-126) -^ input.value.exponent
+    val expBase = muxDouble[UInt](input.format)(exponentF64Subnormal+1)(exponentF32Subnormal+1)
    val expDif = expBase -^ input.value.exponent
    val expSubnormal = !expDif.msb
-    val discardCount = expSubnormal ? expDif.resize(log2Up(p.internalMantissaSize) bits) |  U(0)
+    var discardCount = (expSubnormal ? expDif.resize(log2Up(p.internalMantissaSize) bits) |  U(0))
    if(p.withDouble) when(input.format === FpuFormat.FLOAT){
      discardCount \= discardCount + 29
    }
    val exactMask = (List(True) ++ (0 until p.internalMantissaSize+1).map(_ < discardCount)).asBits.asUInt
    val roundAdjusted = (True ## (manAggregate>>1))(discardCount) ## ((manAggregate & exactMask) =/= 0)
@ -1156,10 +1188,16 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
 //      uf := True
 //    }
-    when(!math.special && math.exponent <= exponentOne-127 && roundAdjusted.asUInt =/= 0){ //Do not catch exact 1.17549435E-38 underflow, but, who realy care ?
+
    val ufSubnormalThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal)(exponentF32Subnormal)
    val ufThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal-52+1)(exponentF32Subnormal-23+1)
    val ofThreshold = muxDouble[UInt](input.format)(exponentF64Infinity-1)(exponentF32Infinity-1)
    when(!math.special && math.exponent <= ufSubnormalThreshold && roundAdjusted.asUInt =/= 0){ //Do not catch exact 1.17549435E-38 underflow, but, who realy care ?
      uf := True
    }
-    when(!math.special && math.exponent >= exponentOne + 128){
+    when(!math.special && math.exponent > ofThreshold){
      nx := True
      of := True
      val doMax = input.roundMode.mux(
@ -1170,7 +1208,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
        FpuRoundMode.RMM -> (False)
      )
      when(doMax){
-        patched.exponent := exponentOne + 127
+        patched.exponent := ofThreshold
        patched.mantissa.setAll()
      } otherwise {
        patched.setInfinity
@ -1178,7 +1216,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
    }
-    when(!math.special && math.exponent <= exponentOne - 127-23){
+    when(!math.special && math.exponent < ufThreshold){
      nx := True
      uf := True
      val doMin = input.roundMode.mux(
@ -1189,7 +1227,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
        FpuRoundMode.RMM -> (False)
      )
      when(doMin){
-        patched.exponent := exponentOne - 127-23+1
+        patched.exponent := ufThreshold.resized
        patched.mantissa := 0
      } otherwise {
        patched.setZero
--- a/src/test/scala/vexriscv/ip/fpu/FpuTest.scala
+++ b/src/test/scala/vexriscv/ip/fpu/FpuTest.scala
@ -23,7 +23,17 @@ import scala.util.Random
 class FpuTest extends FunSuite{
  val b2f = lang.Float.intBitsToFloat(_)
  val b2d = lang.Double.longBitsToDouble(_)
  val f2b = lang.Float.floatToRawIntBits(_)
  // 2^64: added to a negative raw long to get its unsigned 64-bit value.
  val d2bOffset = BigInt("10000000000000000",16)
  /** Returns the raw bits of `that` as a non-negative BigInt (unsigned 64-bit view). */
  def d2b(that : Double) = {
    val raw = lang.Double.doubleToRawLongBits(that)
    val signedBits = BigInt(raw)
    if(raw < 0) d2bOffset + signedBits else signedBits
  }
  test("f32f64") {
@ -64,24 +74,6 @@ class FpuTest extends FunSuite{
          def f32_f32_f32 ={
            val s = new Scanner(next)
            val a,b,c = (s.nextLong(16).toInt)
 //            if(b2f(a).isNaN ||  b2f(b).isNaN){
 //              print("NAN => ")
 //              if(((a >> 23) & 0xFF) == 0xFF && ((a >> 0) & 0xEFFFFF) != 0){
 //                print(a.toHexString)
 //                print(" " + f2b(b2f(a)).toHexString)
 //              }
 //              if(((b >> 23) & 0xFF) == 0xFF && ((b >> 0) & 0xEFFFFF) != 0){
 //                print(b.toHexString)
 //                print(" " + f2b(b2f(b)).toHexString)
 //              }
 //              if(((c >> 23) & 0xFF) == 0xFF && ((c >> 0) & 0xEFFFFF) != 0){
 //                print(" " + c.toHexString)
 //                print(" " + f2b(b2f(c)).toHexString)
 //              }
 //
 //              print(" " + simTime())
 //              println("")
 //            }
            (b2f(a), b2f(b), b2f(c), s.nextInt(16))
          }
@ -105,8 +97,39 @@ class FpuTest extends FunSuite{
            val s = new Scanner(next)
            val a,b = (s.nextLong(16).toInt)
            (b2f(a), b2f(b), s.nextInt(16))
-        }
+          }
          /** Reads the next token of `s` as an unsigned base-16 value, returned as its 64-bit pattern in a Long. */
          def nextLong(s : Scanner) : Long = {
            val token = s.next()
            java.lang.Long.parseUnsignedLong(token, 16)
          }
          /** Parses the string returned by `next`: three unsigned hex f64 bit patterns, then a base-16 Int. */
          def f64_f64_f64 ={
            val scanner = new Scanner(next)
            val first = nextLong(scanner)
            val second = nextLong(scanner)
            val third = nextLong(scanner)
            (b2d(first), b2d(second), b2d(third), scanner.nextInt(16))
          }
          /** Parses the string returned by `next`: a hex value truncated to Int, an unsigned hex f64 bit pattern, then a base-16 Int. */
          def i32_f64 ={
            val scanner = new Scanner(next)
            val intOperand = scanner.nextLong(16).toInt
            val doubleOperand = b2d(nextLong(scanner))
            (intOperand, doubleOperand, scanner.nextInt(16))
          }
          /** Parses the string returned by `next`: an unsigned hex f64 bit pattern, a hex value truncated to Int, then a base-16 Int. */
          def f64_i32 = {
            val scanner = new Scanner(next)
            val doubleOperand = b2d(nextLong(scanner))
            val intOperand = scanner.nextLong(16).toInt
            (doubleOperand, intOperand, scanner.nextInt(16))
          }
          /** Parses the string returned by `next`: two unsigned hex f64 bit patterns, a third unsigned hex value kept as a raw Long, then a base-16 Int. */
          def f64_f64_i32 = {
            val scanner = new Scanner(next)
            val first = nextLong(scanner)
            val second = nextLong(scanner)
            val third = nextLong(scanner)
            (b2d(first), b2d(second), third, scanner.nextInt(16))
          }
          /**
           * Parses the string returned by `next`: two unsigned hex f64 bit patterns, then a base-16 Int.
           *
           * The operands are read with `nextLong(s)` (unsigned parse), like the other f64 parsers.
           * `Scanner.nextLong(16)` is a signed parse and throws on any 64-bit pattern whose top
           * bit is set, i.e. on every negative double.
           */
          def f64_f64 = {
            val s = new Scanner(next)
            val a,b = nextLong(s)
            (b2d(a), b2d(b), s.nextInt(16))
          }
        }
        lazy val RAW = build("")
        lazy val RNE = build("-rnear_even")
@ -125,28 +148,33 @@ class FpuTest extends FunSuite{
        }
      }
-      val f32 = new {
+      class TestVector(f : String) {
-        val add = new TestCase("f32_add")
+        val add = new TestCase(s"${f}_add")
-        val sub = new TestCase("f32_sub")
+        val sub = new TestCase(s"${f}_sub")
-        val mul = new TestCase("f32_mul")
+        val mul = new TestCase(s"${f}_mul")
-        val ui2f = new TestCase("ui32_to_f32")
+        val ui2f = new TestCase(s"ui32_to_${f}")
-        val i2f = new TestCase("i32_to_f32")
+        val i2f = new TestCase(s"i32_to_${f}")
-        val f2ui = new TestCase("f32_to_ui32 -exact")
+        val f2ui = new TestCase(s"${f}_to_ui32 -exact")
-        val f2i = new TestCase("f32_to_i32 -exact")
+        val f2i = new TestCase(s"${f}_to_i32 -exact")
-        val eq = new TestCase("f32_eq")
+        val eq = new TestCase(s"${f}_eq")
-        val lt = new TestCase("f32_lt")
+        val lt = new TestCase(s"${f}_lt")
-        val le = new TestCase("f32_le")
+        val le = new TestCase(s"${f}_le")
-        val min = new TestCase("f32_le")
+        val min = new TestCase(s"${f}_le")
-        val max = new TestCase("f32_lt")
+        val max = new TestCase(s"${f}_lt")
-        val transfer = new TestCase("f32_eq")
+        val transfer = new TestCase(s"${f}_eq")
-        val fclass = new TestCase("f32_eq")
+        val fclass = new TestCase(s"${f}_eq")
-        val sgnj = new TestCase("f32_eq")
+        val sgnj = new TestCase(s"${f}_eq")
-        val sgnjn = new TestCase("f32_eq")
+        val sgnjn = new TestCase(s"${f}_eq")
-        val sgnjx = new TestCase("f32_eq")
+        val sgnjx = new TestCase(s"${f}_eq")
-        val sqrt = new TestCase("f32_sqrt")
+        val sqrt = new TestCase(s"${f}_sqrt")
-        val div = new TestCase("f32_div")
+        val div = new TestCase(s"${f}_div")
        val f32 = new TestCase(s"${f}_eq")
        val f64 = new TestCase(s"${f}_eq")
      }
      val f32 = new TestVector("f32")
      val f64 = new TestVector("f64")
      val cpus = for(id <- 0 until portCount) yield new {
        val cmdQueue = mutable.Queue[FpuCmd => Unit]()
        val commitQueue = mutable.Queue[FpuCommit => Unit]()
@ -165,9 +193,15 @@ class FpuTest extends FunSuite{
          val patch = if(value.abs == 1.17549435E-38f) ref & ~2 else ref
          flagMatch(patch, report)
        }
        /**
         * Double-precision variant of the flag check: clears bit 1 of the expected flags
         * when `value` has the magnitude of the smallest normal double, then delegates to
         * `flagMatch(ref, report)`.
         */
        def flagMatch(ref : Int, value : Double, report : String): Unit ={
          // The shift must be done on a Long: an Int shift masks the distance to 5 bits,
          // so `1 << 52` would yield 1 << 20 rather than the bit pattern 0x0010000000000000.
          val patch = if(value.abs == b2d(1L << 52)) ref & ~2 else ref
          flagMatch(patch, report)
        }
        def flagMatch(ref : Int, report : String): Unit ={
          waitUntil(pendingMiaou == 0)
-          assert(flagAccumulator == ref, s"Flag missmatch dut=$flagAccumulator ref=$ref $report")
+          softAssert(flagAccumulator == ref, s"Flag missmatch dut=$flagAccumulator ref=$ref $report")
          flagAccumulator = 0
        }
        def flagClear(): Unit ={
@ -231,6 +265,10 @@ class FpuTest extends FunSuite{
          loadRaw(rd, f2b(value).toLong & 0xFFFFFFFFl, FpuFormat.FLOAT)
        }
        /** Forwards the raw bits of `value` (as produced by `d2b`) to `loadRaw` for `rd`, tagged FpuFormat.DOUBLE. */
        def load(rd : Int, value : Double): Unit ={
          val rawBits = d2b(value)
          loadRaw(rd, rawBits, FpuFormat.DOUBLE)
        }
        def storeRaw(rs : Int, format : FpuFormat.E)(body : FpuRsp => Unit): Unit ={
          cmdAdd {cmd =>
            cmd.opcode #= cmd.opcode.spinalEnum.STORE
@ -250,8 +288,11 @@ class FpuTest extends FunSuite{
        /** Issues `storeRaw` on `rs` with FpuFormat.FLOAT and hands `body` the response value reinterpreted as a Float. */
        def storeFloat(rs : Int)(body : Float => Unit): Unit ={
          storeRaw(rs, FpuFormat.FLOAT){rsp =>
            val bits = rsp.value.toBigInt.toInt
            body(b2f(bits))
          }
        }
        /** Issues `storeRaw` on `rs` with FpuFormat.DOUBLE and hands `body` the response value reinterpreted as a Double. */
        def store(rs : Int)(body : Double => Unit): Unit ={
          storeRaw(rs, FpuFormat.DOUBLE){rsp =>
            val bits = rsp.value.toBigInt.toLong
            body(b2d(bits))
          }
        }
-        def fpuF2f(rd : Int, rs1 : Int, rs2 : Int, rs3 : Int, opcode : FpuOpcode.E, arg : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
+        def fpuF2f(rd : Int, rs1 : Int, rs2 : Int, rs3 : Int, opcode : FpuOpcode.E, arg : Int, rounding : FpuRoundMode.E, format : FpuFormat.E): Unit ={
          cmdAdd {cmd =>
            cmd.opcode #= opcode
            cmd.rs1 #= rs1
@ -260,6 +301,7 @@ class FpuTest extends FunSuite{
            cmd.rd #= rd
            cmd.arg #= arg
            cmd.roundMode #= rounding
            cmd.format #= format
          }
          commitQueue += {cmd =>
            cmd.write #= true
@ -267,7 +309,7 @@ class FpuTest extends FunSuite{
          }
        }
-        def fpuF2i(rs1 : Int, rs2 : Int, opcode : FpuOpcode.E, arg : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE)(body : FpuRsp => Unit): Unit ={
+        def fpuF2i(rs1 : Int, rs2 : Int, opcode : FpuOpcode.E, arg : Int, rounding : FpuRoundMode.E, format : FpuFormat.E)(body : FpuRsp => Unit): Unit ={
          cmdAdd {cmd =>
            cmd.opcode #= opcode
            cmd.rs1 #= rs1
@ -276,58 +318,59 @@ class FpuTest extends FunSuite{
            cmd.rd.randomize()
            cmd.arg #= arg
            cmd.roundMode #= rounding
            cmd.format #= format
          }
          rspQueue += body
        }
-        def mul(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
+        def mul(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E, format : FpuFormat.E): Unit ={
-          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.MUL, 0, rounding)
+          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.MUL, 0, rounding, format)
        }
-        def add(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
+        def add(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE, format : FpuFormat.E): Unit ={
-          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.ADD, 0, rounding)
+          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.ADD, 0, rounding, format)
        }
-        def sub(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
+        def sub(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE, format : FpuFormat.E): Unit ={
-          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.ADD, 1, rounding)
+          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.ADD, 1, rounding, format)
        }
-        def div(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
+        def div(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE, format : FpuFormat.E): Unit ={
-          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.DIV, Random.nextInt(4), rounding)
+          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.DIV, Random.nextInt(4), rounding, format)
        }
-        def sqrt(rd : Int, rs1 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
+        def sqrt(rd : Int, rs1 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE, format : FpuFormat.E): Unit ={
-          fpuF2f(rd, rs1, Random.nextInt(32), Random.nextInt(32), FpuOpcode.SQRT, Random.nextInt(4), rounding)
+          fpuF2f(rd, rs1, Random.nextInt(32), Random.nextInt(32), FpuOpcode.SQRT, Random.nextInt(4), rounding, format)
        }
-        def fma(rd : Int, rs1 : Int, rs2 : Int, rs3 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
+        def fma(rd : Int, rs1 : Int, rs2 : Int, rs3 : Int, rounding : FpuRoundMode.E, format : FpuFormat.E): Unit ={
-          fpuF2f(rd, rs1, rs2, rs3, FpuOpcode.FMA, 0, rounding)
+          fpuF2f(rd, rs1, rs2, rs3, FpuOpcode.FMA, 0, rounding, format)
        }
-        def sgnjRaw(rd : Int, rs1 : Int, rs2 : Int, arg : Int): Unit ={
+        def sgnjRaw(rd : Int, rs1 : Int, rs2 : Int, arg : Int, format : FpuFormat.E): Unit ={
-          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.SGNJ, arg, FpuRoundMode.elements.randomPick())
+          fpuF2f(rd, rs1, rs2, Random.nextInt(32), FpuOpcode.SGNJ, arg, FpuRoundMode.elements.randomPick(), format)
        }
-        def sgnj(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = null): Unit ={
+        def sgnj(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = null, format : FpuFormat.E): Unit ={
-          sgnjRaw(rd, rs1, rs2, 0)
+          sgnjRaw(rd, rs1, rs2, 0, format)
        }
-        def sgnjn(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = null): Unit ={
+        def sgnjn(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = null, format : FpuFormat.E): Unit ={
-          sgnjRaw(rd, rs1, rs2, 1)
+          sgnjRaw(rd, rs1, rs2, 1, format)
        }
-        def sgnjx(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = null): Unit ={
+        def sgnjx(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = null, format : FpuFormat.E): Unit ={
-          sgnjRaw(rd, rs1, rs2, 2)
+          sgnjRaw(rd, rs1, rs2, 2, format)
        }
-        def cmp(rs1 : Int, rs2 : Int, arg : Int = 1)(body : FpuRsp => Unit): Unit ={
+        def cmp(rs1 : Int, rs2 : Int, arg : Int, format : FpuFormat.E)(body : FpuRsp => Unit): Unit ={
-          fpuF2i(rs1, rs2, FpuOpcode.CMP, arg, FpuRoundMode.elements.randomPick())(body)
+          fpuF2i(rs1, rs2, FpuOpcode.CMP, arg, FpuRoundMode.elements.randomPick(), format)(body)
        }
-        def f2i(rs1 : Int, signed : Boolean, rounding : FpuRoundMode.E = FpuRoundMode.RNE)(body : FpuRsp => Unit): Unit ={
+        def f2i(rs1 : Int, signed : Boolean, rounding : FpuRoundMode.E, format : FpuFormat.E)(body : FpuRsp => Unit): Unit ={
-          fpuF2i(rs1, Random.nextInt(32), FpuOpcode.F2I, if(signed) 1 else 0, rounding)(body)
+          fpuF2i(rs1, Random.nextInt(32), FpuOpcode.F2I, if(signed) 1 else 0, rounding, format)(body)
        }
-        def i2f(rd : Int, value : Int, signed : Boolean, rounding : FpuRoundMode.E): Unit ={
+        def i2f(rd : Int, value : Int, signed : Boolean, rounding : FpuRoundMode.E, format : FpuFormat.E): Unit ={
          cmdAdd {cmd =>
            cmd.opcode #= cmd.opcode.spinalEnum.I2F
            cmd.rs1.randomize()
@ -336,6 +379,7 @@ class FpuTest extends FunSuite{
            cmd.rd #= rd
            cmd.arg #= (if(signed) 1 else 0)
            cmd.roundMode #= rounding
            cmd.format #= format
          }
          commitQueue += {cmd =>
            cmd.write #= true
@ -451,13 +495,13 @@ class FpuTest extends FunSuite{
        }
-        def testBinaryOp(op : (Int,Int,Int,FpuRoundMode.E) => Unit, a : Float, b : Float, ref : Float, flag : Int, rounding : FpuRoundMode.E, opName : String): Unit ={
+        def testBinaryOp(op : (Int,Int,Int,FpuRoundMode.E, FpuFormat.E) => Unit, a : Float, b : Float, ref : Float, flag : Int, rounding : FpuRoundMode.E, opName : String): Unit ={
          val rs = new RegAllocator()
          val rs1, rs2, rs3 = rs.allocate()
          val rd = Random.nextInt(32)
          load(rs1, a)
          load(rs2, b)
-          op(rd,rs1,rs2, rounding)
+          op(rd,rs1,rs2, rounding, FpuFormat.FLOAT)
          storeFloat(rd){v =>
            assert(f2b(v) == f2b(ref), f"## ${a}  ${opName}  $b = $v, $ref $rounding")
          }
@ -466,12 +510,25 @@ class FpuTest extends FunSuite{
        }
        def testBinaryOpF64(op : (Int,Int,Int,FpuRoundMode.E, FpuFormat.E) => Unit, a : Double, b : Double, ref : Double, flag : Int, rounding : FpuRoundMode.E, opName : String): Unit ={
          val rs = new RegAllocator()
          val rs1, rs2, rs3 = rs.allocate()
          val rd = Random.nextInt(32)
          load(rs1, a)
          load(rs2, b)
          op(rd,rs1,rs2, rounding, FpuFormat.DOUBLE)
          store(rd){v =>
            assert(d2b(v) == d2b(ref), f"## ${a}  ${opName}  $b = $v, $ref $rounding")
          }
-        def testTransferRaw(a : Float, iSrc : Boolean, iDst : Boolean): Unit ={
+          flagMatch(flag, ref, f"## ${opName} ${a} $b $ref $rounding")
        }
        def testTransferF32Raw(a : Float, iSrc : Boolean, iDst : Boolean): Unit ={
          val rd = Random.nextInt(32)
          def handle(v : Float): Unit ={
            val refUnclamped = a
            val ref = a
            assert(f2b(v) == f2b(ref), f"$a = $v, $ref")
          }
@ -482,6 +539,49 @@ class FpuTest extends FunSuite{
          flagMatch(0, f"$a")
        }
        /** Loads `a` into a random register, stores it back as a double, and checks the bits are unchanged and no flag is raised. */
        def testTransferF64Raw(a : Double): Unit ={
          val rd = Random.nextInt(32)
          load(rd, a)
          store(rd){v =>
            // The reference is the input itself: the transfer must be bit-exact.
            val ref = a
            assert(d2b(v) == d2b(ref), f"$a = $v, $ref")
          }
          flagMatch(0, f"$a")
        }
        /**
         * Writes the float `a` into a random register (through `fmv_w_x` when `iSrc`, else `load`),
         * reads it back with a DOUBLE-format raw store, and checks that the upper 32 bits are all
         * ones above the float's bit pattern. Expects no flags.
         */
        def testTransferF32F64Raw(a : Float,  iSrc : Boolean): Unit ={
          val rd = Random.nextInt(32)
          if(iSrc) {
            fmv_w_x(rd, f2b(a))
          } else {
            load(rd, a)
          }
          storeRaw(rd, FpuFormat.DOUBLE){rsp =>
            val ref = (0xFFFFFFFFl << 32) | f2b(a)
            val v = rsp.value.toBigInt.toLong
            assert(v == ref, f"$a = $v, $ref")
          }
          flagMatch(0, f"$a")
        }
        /**
         * Loads the double `a` into a random register, reads it back through a 32-bit path
         * (`fmv_x_w` when `iDst`, else a FLOAT-format raw store), and checks that the result
         * equals the low 32 bits of `a`'s bit pattern. Expects no flags.
         */
        def testTransferF64F32Raw(a : Double,  iDst : Boolean): Unit ={
          val rd = Random.nextInt(32)
          // Shared comparison for both read-back paths.
          def check(v : Long): Unit ={
            val ref = d2b(a) & 0xFFFFFFFFl
            assert(v == ref, f"$a = $v, $ref")
          }
          load(rd, a)
          if(iDst) {
            fmv_x_w(rd){v_ => check(f2b(v_).toLong & 0xFFFFFFFFl)}
          } else {
            storeRaw(rd, FpuFormat.FLOAT){rsp => check(rsp.value.toBigInt.toLong & 0xFFFFFFFFl)}
          }
          flagMatch(0, f"$a")
        }
        def testClassRaw(a : Float) : Unit = {
          val rd = Random.nextInt(32)
@ -513,7 +613,7 @@ class FpuTest extends FunSuite{
          load(rs2, b)
          load(rs3, c)
-          fma(rd,rs1,rs2,rs3)
+          fma(rd,rs1,rs2,rs3, FpuRoundMode.RNE, FpuFormat.FLOAT)
          storeFloat(rd){v =>
            val ref = a.toDouble * b.toDouble + c.toDouble
            println(f"$a%.20f * $b%.20f + $c%.20f = $v%.20f, $ref%.20f")
@ -530,7 +630,7 @@ class FpuTest extends FunSuite{
          load(rs1, a)
          load(rs2, b)
-          div(rd,rs1,rs2)
+          div(rd,rs1,rs2, FpuRoundMode.RNE, FpuFormat.FLOAT)
          storeFloat(rd){v =>
            val refUnclamped = a/b
            val refClamped = ((a)/(b))
@ -547,7 +647,7 @@ class FpuTest extends FunSuite{
          val rd = Random.nextInt(32)
          load(rs1, a)
-          sqrt(rd,rs1)
+          sqrt(rd,rs1, FpuRoundMode.RNE, FpuFormat.FLOAT)
          storeFloat(rd){v =>
            val ref = Math.sqrt(a).toFloat
            val error = Math.abs(ref-v)/ref
@ -564,7 +664,7 @@ class FpuTest extends FunSuite{
          val rd = Random.nextInt(32)
          load(rs1, a)
-          sqrt(rd,rs1)
+          sqrt(rd,rs1,  FpuRoundMode.RNE, FpuFormat.FLOAT)
          storeFloat(rd){v =>
            val error = Math.abs(ref-v)/ref
            println(f"sqrt($a) = $v, $ref $error $rounding")
@ -579,7 +679,7 @@ class FpuTest extends FunSuite{
          load(rs1, a)
          load(rs2, b)
-          div(rd,rs1, rs2)
+          div(rd,rs1, rs2, FpuRoundMode.RNE, FpuFormat.FLOAT)
          storeFloat(rd){v =>
            val error = Math.abs(ref-v)/ref
            println(f"div($a, $b) = $v, $ref $error $rounding")
@ -594,16 +694,16 @@ class FpuTest extends FunSuite{
          val rs1 = rs.allocate()
          val rd = Random.nextInt(32)
          load(rs1, a)
-          f2i(rs1, signed, rounding){rsp =>
+          f2i(rs1, signed, rounding, FpuFormat.FLOAT){rsp =>
            if(signed) {
-              val v = rsp.value.toLong.toInt
+              val v = rsp.value.toBigInt.toInt
              var ref2 = ref
              if(a >= Int.MaxValue) ref2 = Int.MaxValue
              if(a <= Int.MinValue) ref2 = Int.MinValue
              if(a.isNaN) ref2 = Int.MaxValue
              assert(v == (ref2), f" <= f2i($a) = $v, $ref2, $rounding, $flag")
            } else {
-              val v = rsp.value.toLong
+              val v = rsp.value.toBigInt.toLong & 0xFFFFFFFFl
              var ref2 = ref.toLong & 0xFFFFFFFFl
              if(a < 0) ref2 = 0
              if(a >= 0xFFFFFFFFl) ref2 = 0xFFFFFFFFl
@ -621,15 +721,15 @@ class FpuTest extends FunSuite{
        // Checks one integer -> float conversion against an expected result.
        //   a        : integer operand handed to i2f
        //   b        : expected float; compared bit-for-bit (via f2b) with the value read back
        //   f        : expected flags value, forwarded to flagMatch
        //   signed   : forwarded to i2f; also selects how `a` is rendered in the failure message
        //   rounding : rounding mode forwarded to i2f
        def testI2fExact(a : Int, b : Float, f : Int, signed : Boolean, rounding : FpuRoundMode.E): Unit ={
          val rs = new RegAllocator()
          // Destination register index drawn from 0..31.
          val rd = Random.nextInt(32)
-          i2f(rd, a, signed, rounding)
+          i2f(rd, a, signed, rounding, FpuFormat.FLOAT)
          storeFloat(rd){v =>
            // Message-only view of the operand: widened as-is when signed, otherwise masked to its low 32 bits.
            val aLong = if(signed) a.toLong else a.toLong & 0xFFFFFFFFl
            val ref = b
            // Compare raw encodings rather than float values.
-            assert(f2b(v) == f2b(ref), f"i2f($aLong) = $v, $ref")
+            assert(f2b(v) == f2b(ref), f"i2f($aLong) = $v, $ref $rounding")
          }
-          flagMatch(f, b, f"i2f() = $b")
+          flagMatch(f, b, f"i2f($a) = $b")
        }
@ -640,7 +740,7 @@ class FpuTest extends FunSuite{
          val rd = Random.nextInt(32)
          load(rs1, a)
          load(rs2, b)
-          cmp(rs1, rs2, arg){rsp =>
+          cmp(rs1, rs2, arg, FpuFormat.FLOAT){rsp =>
            val v = rsp.value.toLong
            assert(v === ref, f"cmp($a, $b, $arg) = $v, $ref")
          }
@ -744,29 +844,6 @@ class FpuTest extends FunSuite{
          }
        }
 //        for(i <- 0 until 64){
 //          val rounding = FpuRoundMode.RMM
 //          val a = 24f
 //          val b = b2f(0x3f800000+i)
 //          val c = Clib.math.mulF32(a, b, rounding.position)
 //          val f = 0
 //          testMulExact(a,b,c,f, rounding)
 //        }
        // The add, sub and mul issuer functions gathered under one function type:
        // three Ints plus a rounding mode, returning Unit. The Ints are presumably
        // register indices (rd, rs1, rs2) — TODO confirm against the definitions of add/sub/mul.
        val binaryOps = List[(Int,Int,Int,FpuRoundMode.E) => Unit](add, sub, mul)
 //        testSqrt(0.0f)
        //        testSqrt(1.2f)
        //        for(a <- fAll) testSqrt(a)
 //        for(_ <- 0 until 1000) testSqrt(randomFloat())
        def testFma() : Unit = {
          testFmaRaw(randomFloat(), randomFloat(), randomFloat())
          flagClear()
@ -786,13 +863,13 @@ class FpuTest extends FunSuite{
          testEqRaw(a,b,i, f)
        }
-        def testF2ui() : Unit = {
+        def testF2uiF32() : Unit = {
          // One randomized f2ui check (float -> unsigned int, going by the name); this change renames testF2ui to testF2uiF32.
          // The randomly picked rounding mode is used both to select the (a, b, f) vector
          // from f32.f2ui and as the mode passed on to testF2iExact.
          val rounding = FpuRoundMode.elements.randomPick()
          val (a,b,f) = f32.f2ui(rounding).f32_i32
          // The literal `false` is the fourth argument of testF2iExact — presumably its
          // `signed` flag, i.e. the unsigned path; TODO confirm against testF2iExact.
          testF2iExact(a,b, f, false, rounding)
        }
-        def testF2i() : Unit = {
+        def testF2iF32() : Unit = {
          val rounding = FpuRoundMode.elements.randomPick()
          val (a,b,f) = f32.f2i(rounding).f32_i32
          testF2iExact(a,b, f, true, rounding)
@ -823,11 +900,26 @@ class FpuTest extends FunSuite{
          testSgnjxRaw(a, b)
        }
-        def testTransfer() : Unit = {
+        def testTransferF32() : Unit = {
          // One randomized f32 transfer check; this change renames testTransfer to testTransferF32
          // and its helper testTransferRaw to testTransferF32Raw.
          // Only `a` from the four-field vector is used; b, r and f are bound but never read.
          val (a,b,r,f) = f32.transfer.RAW.f32_f32_i32
          // Two independent random Booleans are passed to the helper; their meaning is defined
          // by testTransferF32Raw, which is not visible here.
-          testTransferRaw(a, Random.nextBoolean(), Random.nextBoolean())
+          testTransferF32Raw(a, Random.nextBoolean(), Random.nextBoolean())
        }
        // Runs a single f64 transfer check on the next vector from f64.transfer.RAW.
        def testTransferF64() : Unit = {
          // The vector has four fields; only the first one is needed by the raw check.
          val (operand, _, _, _) = f64.transfer.RAW.f64_f64_i32
          testTransferF64Raw(operand)
        }
        // Runs a single f64/f32 cross-format transfer check on the next vector from f64.f32.RAW.
        def testTransferF64F32() : Unit = {
          // Only the first of the four vector fields is used.
          val (operand, _, _, _) = f64.f32.RAW.f64_f64_i32
          // Random Boolean drawn after the vector is read, then handed to the raw check as its
          // second argument (meaning defined by testTransferF64F32Raw).
          val coin = Random.nextBoolean()
          testTransferF64F32Raw(operand, coin)
        }
        // Runs a single f32/f64 cross-format transfer check on the next vector from f32.f64.RAW.
        def testTransferF32F64() : Unit = {
          // Only the first of the four vector fields is used.
          val (operand, _, _, _) = f32.f64.RAW.f32_f32_i32
          // Random Boolean drawn after the vector is read, then handed to the raw check as its
          // second argument (meaning defined by testTransferF32F64Raw).
          val coin = Random.nextBoolean()
          testTransferF32F64Raw(operand, coin)
        }
        def testClass() : Unit = {
          val (a,b,r,f) = f32.fclass.RAW.f32_f32_i32
          testClassRaw(a)
@ -854,59 +946,112 @@ class FpuTest extends FunSuite{
          testI2fExact(a,b,f, false, rounding)
        }
-        def testMul() : Unit = {
+        def testMulF32() : Unit = {
          // One randomized f32 multiplication check; this change renames testMul to testMulF32.
          // The randomly picked rounding mode selects the (a, b, c, f) vector from f32.mul
          // and is also forwarded to testBinaryOp.
          val rounding = FpuRoundMode.elements.randomPick()
          val (a,b,c,f) = f32.mul(rounding).f32_f32_f32
          // `mul` is the operation issuer; "mul" is a label string passed alongside it.
          testBinaryOp(mul,a,b,c,f, rounding,"mul")
        }
-        def testAdd() : Unit = {
+        def testAddF32() : Unit = {
          // One randomized f32 addition check; this change renames testAdd to testAddF32.
          // The randomly picked rounding mode selects the (a, b, c, f) vector from f32.add
          // and is also forwarded to testBinaryOp.
          val rounding = FpuRoundMode.elements.randomPick()
          val (a,b,c,f) = f32.add(rounding).f32_f32_f32
          // `add` is the operation issuer; "add" is a label string passed alongside it.
          testBinaryOp(add,a,b,c,f, rounding,"add")
        }
-        def testSub() : Unit = {
+        def testSubF32() : Unit = {
          // One randomized f32 subtraction check; this change renames testSub to testSubF32.
          // The randomly picked rounding mode selects the (a, b, c, f) vector from f32.sub
          // and is also forwarded to testBinaryOp.
          val rounding = FpuRoundMode.elements.randomPick()
          val (a,b,c,f) = f32.sub(rounding).f32_f32_f32
          // `sub` is the operation issuer; "sub" is a label string passed alongside it.
          testBinaryOp(sub,a,b,c,f, rounding,"sub")
        }
        // Runs a single randomized f64 multiplication check.
        def testMulF64() : Unit = {
          // The same randomly picked rounding mode selects the vector and is passed to the checker.
          val mode = FpuRoundMode.elements.randomPick()
          // Vector layout: two operands, the expected product, and the expected flags.
          val (lhs, rhs, expected, flags) = f64.mul(mode).f64_f64_f64
          testBinaryOpF64(mul, lhs, rhs, expected, flags, mode, "mul")
        }
-        val f32Tests = List[() => Unit](testSub, testAdd, testMul, testI2f, testUI2f, testMin, testMax, testSgnj, testTransfer, testDiv, testSqrt, testF2i, testF2ui, testLe, testEq, testLt, testClass, testFma)
+
        // The f32 test routines collected as zero-argument thunks, using the *F32 names
        // introduced by this change. How the list is consumed is not visible in this part of the file.
        val f32Tests = List[() => Unit](testSubF32, testAddF32, testMulF32, testI2f, testUI2f, testMin, testMax, testSgnj, testTransferF32, testDiv, testSqrt, testF2iF32, testF2uiF32, testLe, testEq, testLt, testClass, testFma)
        //TODO test boxing
        // Double-precision checks; executed only when the FPU parameters enable doubles (p.withDouble).
        if(p.withDouble) {
 //          for(_ <- 0 until 10000) testUI2f64()
 //          for(_ <- 0 until 10000) testI2f64()
 //          println("f64 i2f done")
 //
 //          for(_ <- 0 until 10000) testF2uiF64()
 //          for(_ <- 0 until 10000) testF2iF64()
 //          println("f64 f2i done")
 //          testF2iExact(1.0f,1, 0, false, FpuRoundMode.RTZ)
 //          testF2iExact(2.0f,2, 0, false, FpuRoundMode.RTZ)
 //          testF2iExact(2.5f,2, 1, false, FpuRoundMode.RTZ)
          // Directed f64 multiplications with exact products and an expected flags value of 0,
          // followed by 10000 randomized multiplication vectors.
          testBinaryOpF64(mul,1.0, 1.0, 1.0,0 , FpuRoundMode.RNE,"mul")
          testBinaryOpF64(mul,1.0, 2.0, 2.0,0 , FpuRoundMode.RNE,"mul")
          testBinaryOpF64(mul,2.5, 2.0, 5.0,0 , FpuRoundMode.RNE,"mul")
-        testTransferRaw(1.0f, false, false)
+          for(_ <- 0 until 10000) testMulF64()
-        testTransferRaw(2.0f, false, false)
+          println("f64 Mul done")
          // NOTE(review): the four testTransferRaw calls below sit at the outer indentation and use the
          // helper name that this change renames to testTransferF32Raw; they look like removed-side
          // lines of the diff rather than live code — confirm.
        testTransferRaw(2.5f, false, false)
        testTransferRaw(6.97949770801e-39f, false, false)
        testTransferRaw(8.72437213501e-40f, false, false)
        testTransferRaw(5.6E-45f, false, false)
          // Directed f64 transfers, reusing the same literal values as the f32 directed cases.
          testTransferF64Raw(1.0)
          testTransferF64Raw(2.0)
          testTransferF64Raw(2.5)
          testTransferF64Raw(6.97949770801e-39)
          testTransferF64Raw(8.72437213501e-40)
          testTransferF64Raw(5.6E-45)
          // Directed cross-format transfers built from raw bit patterns; both patterns have an
          // all-ones exponent and a non-zero mantissa (NaN encodings). Each is run with the
          // Boolean argument set to false and then to true.
          testTransferF32F64Raw(b2f(0xFFFF1234), false)
          testTransferF64F32Raw(b2d(0xFFF123498765463l << 4), false)
          testTransferF32F64Raw(b2f(0xFFFF1234), true)
          testTransferF64F32Raw(b2d(0xFFF123498765463l << 4), true)
          // Randomized transfer runs, 10000 vectors per direction.
          for (_ <- 0 until 10000) testTransferF64()
          println("f64 load/store/rf transfer done")
          for (_ <- 0 until 10000) testTransferF64F32()
          println("f64 -> f32 load/store/rf transfer done")
-        for(_ <- 0 until 10000) testTransfer()
+          for (_ <- 0 until 10000) testTransferF32F64()
          println("f32 -> f64 load/store/rf transfer done")
        }
        // Single-precision runs: each phase executes its randomized test 10000 times
        // (1000 for fma) and prints a progress line when it completes.
        for(_ <- 0 until 10000) testTransferF32()
        println("f32 load/store/rf transfer done")
-        for(_ <- 0 until 10000) testF2ui()
+        for(_ <- 0 until 10000) testMulF32()
-        for(_ <- 0 until 10000) testF2i()
+        println("Mul done")
-        println("f2i done")
+
        for(_ <- 0 until 10000) testUI2f()
        for(_ <- 0 until 10000) testI2f()
        println("i2f done")
        // Directed float -> integer checks with RTZ rounding; the third argument is the expected
        // flags value (0 for the exact inputs, 1 for 2.5f which cannot convert exactly).
        testF2iExact(1.0f,1, 0, false, FpuRoundMode.RTZ)
        testF2iExact(2.0f,2, 0, false, FpuRoundMode.RTZ)
        testF2iExact(2.5f,2, 1, false, FpuRoundMode.RTZ)
        for(_ <- 0 until 10000) testF2uiF32()
        for(_ <- 0 until 10000) testF2iF32()
        println("f2i done")
 //        waitUntil(cmdQueue.isEmpty)
 //        dut.clockDomain.waitSampling(1000)
 //        simSuccess()
        // The loop index i is not used by the body.
        for(i <- 0 until 1000) testFma()
        flagClear()
        println("fma done") //TODO
@ -959,14 +1104,11 @@ class FpuTest extends FunSuite{
        // NOTE(review): this change renames testMul to testMulF32 elsewhere in the file, yet this
        // call still uses the old name — confirm it still resolves (or is diff residue).
        for(_ <- 0 until 10000) testMul()
        println("Mul done")
        // Addition and subtraction runs share the single "Add done" progress line.
-        for(_ <- 0 until 10000) testAdd()
+        for(_ <- 0 until 10000) testAddF32()
-        for(_ <- 0 until 10000) testSub()
+        for(_ <- 0 until 10000) testSubF32()
        println("Add done")