fpu merged i2f with load pipeline

2025-01-03 03:43:39 -05:00 · 2021-01-26 15:28:09 +01:00 · 2021-01-26 15:28:09 +01:00 · 444bcdba0a
commit 444bcdba0a
parent 3334364f5f
4 changed files with 146 additions and 135 deletions
--- a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
+++ b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
@ -47,6 +47,8 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
    val source = Source()
    val rd = p.rfAddress()
    val lockId = lockIdType()
+    val i2f = Bool()
+    val arg = Bits(2 bits)
  }

  case class ShortPipInput() extends Bundle{
@ -228,13 +230,14 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
    val input = read.output.combStage()
    input.ready := False

-    val loadHit = List(FpuOpcode.LOAD, FpuOpcode.FMV_W_X).map(input.opcode === _).orR
+    val loadHit = List(FpuOpcode.LOAD, FpuOpcode.FMV_W_X, FpuOpcode.I2F).map(input.opcode === _).orR
    val load = Stream(LoadInput())
    load.valid := input.valid && loadHit
    input.ready setWhen(loadHit && load.ready)
    load.payload.assignSomeByName(read.output.payload)
+    load.i2f := input.opcode === FpuOpcode.I2F

-    val shortPipHit = List(FpuOpcode.STORE, FpuOpcode.F2I, FpuOpcode.CMP, FpuOpcode.I2F, FpuOpcode.MIN_MAX, FpuOpcode.SGNJ, FpuOpcode.FMV_X_W, FpuOpcode.FCLASS).map(input.opcode === _).orR
+    val shortPipHit = List(FpuOpcode.STORE, FpuOpcode.F2I, FpuOpcode.CMP, FpuOpcode.MIN_MAX, FpuOpcode.SGNJ, FpuOpcode.FMV_X_W, FpuOpcode.FCLASS).map(input.opcode === _).orR
    val shortPip = Stream(ShortPipInput())
    input.ready setWhen(shortPipHit && shortPip.ready)
    shortPip.valid := input.valid && shortPipHit
@ -289,7 +292,9 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
      val source = Source()
      val lockId = lockIdType()
      val rd = p.rfAddress()
-      val value = FpuFloat(exponentSize = p.internalExponentSize-1, mantissaSize = p.internalMantissaSize)
+      val value = p.storeLoadType()
+      val i2f = Bool()
+      val arg = Bits(2 bits)
    }

    val s0 = new Area{
@ -304,18 +309,28 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
      output.source := input.source
      output.lockId := input.lockId
      output.rd := input.rd
-      output.value.mantissa := feed.value(0, 23 bits).asUInt
-      output.value.exponent := feed.value(23, 8 bits).asUInt
-      output.value.sign := feed.value(31)
+      output.value := feed.value
+      output.i2f := input.i2f
+      output.arg := input.arg
    }

+
+//    val i2fSign = input.arg(0) && input.value.msb
+//    val i2fUnsigned = input.value.asUInt.twoComplement(i2fSign).resize(32 bits)
+//    val i2fLog2 = OHToUInt(OHMasking.last(i2fUnsigned))
+//    val i2fShifted = (i2fUnsigned << p.internalMantissaSize) >> i2fLog2
+//    rfOutput.value.sign := i2fSign
+//    rfOutput.value.exponent := i2fLog2 +^ exponentOne
+//    rfOutput.value.mantissa := U(i2fShifted).resized
+//    rfOutput.value.special := False //TODO
+
    val s1 = new Area{
      val input = s0.output.stage()
      val busy = False

-      val f32Mantissa = input.value.mantissa
-      val f32Exponent = input.value.exponent
-      val f32Sign     = input.value.sign
+      val f32Mantissa = input.value(0, 23 bits).asUInt
+      val f32Exponent = input.value(23, 8 bits).asUInt
+      val f32Sign     = input.value(31)

      val expZero = f32Exponent === 0
      val expOne =  f32Exponent === 255
@ -329,18 +344,31 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
      val isNan       =  expOne  && !manZero
      val isQuiet     = f32Mantissa.msb

-      val subnormal = new Area{
+      val fsm = new Area{
        val manTop = Reg(UInt(log2Up(p.internalMantissaSize) bits))
-        val shift = isSubnormal ? manTop | U(0)
+        val shift =  CombInit(manTop)
        val counter = Reg(UInt(log2Up(p.internalMantissaSize+1) bits))
-        val done, boot = Reg(Bool())
-        when(isSubnormal && !done){
+        val done, boot, patched = Reg(Bool())
+        val ohInput = CombInit(input.value(0, 32 max p.internalMantissaSize bits))
+        when(!input.i2f) { ohInput(9, 23 bits) := input.value(0, 23 bits) }
+        val i2fZero = Reg(Bool)
+        when(input.valid && (input.i2f || isSubnormal) && !done){
          busy := True
          when(boot){
-            manTop := OHToUInt(OHMasking.first((f32Mantissa).reversed))
-            boot := False
+            when(input.i2f && !patched && input.value.msb && input.arg(0)){
+              input.value.getDrivingReg(0, 32 bits) := B(input.value.asUInt.twoComplement(True).resize(32 bits))
+              patched := True
+            } otherwise {
+              manTop := OHToUInt(OHMasking.first((ohInput).reversed))
+              boot := False
+              i2fZero := input.value(31 downto 0) === 0
+            }
          } otherwise {
-            input.value.mantissa.getDrivingReg := input.value.mantissa |<< 1
+            when(input.i2f){
+              input.value.getDrivingReg(0, 32 bits) := input.value(0, 32 bits) |<< 1
+            } otherwise {
+              input.value.getDrivingReg(0, 23 bits) := input.value(0, 23 bits) |<< 1
+            }
            counter := counter + 1
            when(counter === shift) {
              done := True
@ -358,16 +386,20 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
          counter := 0
          done := False
          boot := True
+          patched := False
        }
      }

+
+      val i2fSign = fsm.patched
+      val i2fShifted = input.value.takeHigh(23)
+
      val recoded = p.internalFloating()
      recoded.mantissa := f32Mantissa
-      recoded.exponent := (f32Exponent -^ subnormal.expOffset + (exponentOne - 127)).resized
+      recoded.exponent := (f32Exponent -^ fsm.expOffset + (exponentOne - 127)).resized
      recoded.sign     := f32Sign
      recoded.setNormal
      when(isZero){recoded.setZero}
-      //when(isSubnormal){recoded.setSubnormal}
      when(isInfinity){recoded.setInfinity}
      when(isNan){recoded.setNan}

@ -376,6 +408,13 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
      output.lockId := input.lockId
      output.rd := input.rd
      output.value := recoded
+      when(input.i2f){
+        output.value.sign := i2fSign
+        output.value.exponent := (U(exponentOne+31) - fsm.manTop).resized
+        output.value.mantissa := U(i2fShifted)
+        output.value.setNormal
+        when(fsm.i2fZero) { output.value.setZero }
+      }
    }
  }

@ -401,10 +440,9 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
    val subnormal = new Area{
      val needRecoding = List(FpuOpcode.FMV_X_W, FpuOpcode.STORE).map(_ === input.opcode).orR
      val manTop = Reg(UInt(log2Up(p.internalMantissaSize) bits))
-      val shift = isSubnormal ? manTop | U(0)
      val counter = Reg(UInt(log2Up(p.internalMantissaSize+1) bits))
      val done, boot = Reg(Bool())
-      when(needRecoding && isSubnormal && !done){
+      when(input.valid && needRecoding && isSubnormal && !done){
        halt := True
        when(boot){
          manTop := (U(exponentOne - 127) - recoded.exponent).resized
@ -412,7 +450,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
        } otherwise {
          recoded.mantissa.getDrivingReg := (U(counter === 0) @@ recoded.mantissa) >> 1
          counter := counter + 1
-          when(counter === shift) {
+          when(counter === manTop) {
            done := True
          }
        }
@ -450,11 +488,6 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
    val f2iUnsigned = f2iShifted >> p.internalMantissaSize
    val f2iResult = (f2iUnsigned.twoComplement(input.arg(0) && input.rs1.sign)).asBits.resize(32 bits)

-    val i2fSign = input.arg(0) && input.value.msb
-    val i2fUnsigned = input.value.asUInt.twoComplement(i2fSign).resize(32 bits)
-    val i2fLog2 = OHToUInt(OHMasking.last(i2fUnsigned))
-    val i2fShifted = (i2fUnsigned << p.internalMantissaSize) >> i2fLog2
-
    val bothZero = input.rs1.isZero && input.rs2.isZero
    val rs1Equal = input.rs1 === input.rs2
    val rs1AbsSmaller = (input.rs1.exponent @@ input.rs1.mantissa) < (input.rs2.exponent @@ input.rs2.mantissa)
@ -496,7 +529,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
      is(FpuOpcode.FCLASS)  { result := fclassResult.resized }
    }

-    val toFpuRf = List(FpuOpcode.MIN_MAX, FpuOpcode.I2F, FpuOpcode.SGNJ).map(input.opcode === _).orR
+    val toFpuRf = List(FpuOpcode.MIN_MAX, FpuOpcode.SGNJ).map(input.opcode === _).orR

    rfOutput.valid := input.valid && toFpuRf && !halt
    rfOutput.source := input.source
@ -504,12 +537,6 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
    rfOutput.rd := input.rd
    rfOutput.value.assignDontCare()
    switch(input.opcode){
-      is(FpuOpcode.I2F){
-        rfOutput.value.sign := i2fSign
-        rfOutput.value.exponent := i2fLog2 +^ exponentOne
-        rfOutput.value.mantissa := U(i2fShifted).resized
-        rfOutput.value.special := False //TODO
-      }
      is(FpuOpcode.MIN_MAX){
        rfOutput.value := minMaxResult
      }
@ -550,7 +577,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
      val exp = math.exp + U(needShift)
      val man = needShift ? mulRounded(1, p.internalMantissaSize bits) | mulRounded(0, p.internalMantissaSize bits)

-      val forceZero = input.rs1.isZeroOrSubnormal || input.rs2.isZeroOrSubnormal
+      val forceZero = input.rs1.isZero || input.rs2.isZero
      val forceUnderflow = exp <= exponentOne + exponentOne - 127 - 23  // 0x6A //TODO
      val forceOverflow = exp > exponentOne + exponentOne + 127 || input.rs1.isInfinity || input.rs2.isInfinity
      val forceNan = input.rs1.isNan || input.rs2.isNan || ((input.rs1.isInfinity || input.rs2.isInfinity) && (input.rs1.isZero || input.rs2.isZero))
@ -717,8 +744,8 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
        decode.divSqrtToMul.rs2.exponent := divExp.value + iterationValue.msb.asUInt
        decode.divSqrtToMul.rs2.mantissa := (iterationValue << 1).resized
        val zero = input.rs2.isInfinity
-        val overflow = input.rs2.isZeroOrSubnormal
-        val nan = input.rs2.isNan || (input.rs1.isZeroOrSubnormal && input.rs2.isZeroOrSubnormal)
+        val overflow = input.rs2.isZero
+        val nan = input.rs2.isNan || (input.rs1.isZero && input.rs2.isZero)

        when(nan){
          decode.divSqrtToMul.rs2.setNanQuiet
@ -785,12 +812,12 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{

    val shifter = new Area {
      val exp21 = input.rs2.exponent -^ input.rs1.exponent
-      val rs1ExponentBigger = (exp21.msb || input.rs2.isZeroOrSubnormal) && !input.rs1.isZeroOrSubnormal
+      val rs1ExponentBigger = (exp21.msb || input.rs2.isZero) && !input.rs1.isZero
      val rs1ExponentEqual = input.rs1.exponent === input.rs2.exponent
      val rs1MantissaBigger = input.rs1.mantissa > input.rs2.mantissa
-      val absRs1Bigger = ((rs1ExponentBigger || rs1ExponentEqual && rs1MantissaBigger) && !input.rs1.isZeroOrSubnormal || input.rs1.isInfinity) && !input.rs2.isInfinity
+      val absRs1Bigger = ((rs1ExponentBigger || rs1ExponentEqual && rs1MantissaBigger) && !input.rs1.isZero || input.rs1.isInfinity) && !input.rs2.isInfinity
      val shiftBy = rs1ExponentBigger ? (0-exp21) | exp21
-      val passThrough = shiftBy >= p.internalMantissaSize || (input.rs1.isZeroOrSubnormal) || (input.rs2.isZeroOrSubnormal)
+      val passThrough = shiftBy >= p.internalMantissaSize || (input.rs1.isZero) || (input.rs2.isZero)

      //Note that rs1ExponentBigger can be replaced by absRs1Bigger below to avoid the xsigned two's complement in the math block, at the expense of the combinatorial path
      val xySign = absRs1Bigger ? input.rs1.sign | input.rs2.sign
@ -827,8 +854,8 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
 //      val mantissaShifted = (xyMantissa |<< shift)
 //      val mantissa = ((xyMantissa ) >> 2) + U(xyMantissa(1))
      val exponent = xyExponent -^ shift + 1
-      xySign clearWhen(input.rs1.isZeroOrSubnormal && input.rs2.isZeroOrSubnormal)
-      val forceZero = xyMantissa === 0 || exponent.msb || (input.rs1.isZeroOrSubnormal && input.rs2.isZeroOrSubnormal)
+      xySign clearWhen(input.rs1.isZero && input.rs2.isZero)
+      val forceZero = xyMantissa === 0 || exponent.msb || (input.rs1.isZero && input.rs2.isZero)
      val forceOverflow = exponent === exponentOne + 128 ||  (input.rs1.isInfinity || input.rs2.isInfinity)
      val forceNan = input.rs1.isNan || input.rs2.isNan || (input.rs1.isInfinity && input.rs2.isInfinity && (input.rs1.sign ^ input.rs2.sign))
    }
@ -847,7 +874,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
      output.value.setNanQuiet
    } elsewhen(norm.forceZero) {
      output.value.setZero
-      when(norm.xyMantissa === 0 || input.rs1.isZeroOrSubnormal && input.rs2.isZeroOrSubnormal){
+      when(norm.xyMantissa === 0 || input.rs1.isZero && input.rs2.isZero){
        output.value.sign := input.rs1.sign && input.rs2.sign
      }
    } elsewhen(norm.forceOverflow) {
@ -856,26 +883,6 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
  }


-//  val format = new Area{
-//    val input = pipeArbiter.arbitrated.combStage()
-//
-//    val rotate = new Area{
-//      val input = Bits(p.internalMantissaSize bits)
-//      val shift = UInt(log2Up(p.internalMantissaSize) bits)
-//      val output = input.rotateLeft(shift)
-//    }
-//
-//    val decode = new Area{
-//      val sign = input.raw(31)
-//      val exp = input.raw(23, 8 bits).asUInt
-//      val man = input.raw(23, 8 bits).asUInt
-//      val isSubnormal = exp === 0 //zero ?
-//      val manTop = OHToUInt(OHMasking.first((man ## U"1").reversed))
-//      val shift = isSubnormal ? manTop | U(0)
-//      rotate.shift := shift
-//    }
-//  }
-
  val write = new Area{
    val arbitrated = StreamArbiterFactory.lowerFirst.noLock.on(List(load.s1.output, add.output, mul.output, shortPip.rfOutput))
    val isCommited = rf.lock.map(_.commited).read(arbitrated.lockId)
--- a/src/main/scala/vexriscv/ip/fpu/Interface.scala
+++ b/src/main/scala/vexriscv/ip/fpu/Interface.scala
@ -25,9 +25,8 @@ case class FpuFloatDecoded() extends Bundle{

 object FpuFloat{
  val ZERO = 0
-  val SUBNORMAL = 1
-  val INFINITY = 2
-  val NAN = 3
+  val INFINITY = 1
+  val NAN = 2
 }

 case class FpuFloat(exponentSize: Int,
@ -45,22 +44,17 @@ case class FpuFloat(exponentSize: Int,
    ret
  }

-
-  def isZeroOrSubnormal =  special && exponent(1) === False
-
  def isNormal    = !special
-  def isZero      =  special && exponent(1 downto 0) === 0
-  //def isSubnormal =  special && exponent(1 downto 0) === 1
-  def isInfinity  =  special && exponent(1 downto 0) === 2
-  def isNan       =  special && exponent(1 downto 0) === 3
+  def isZero      =  special && exponent(1 downto 0) === FpuFloat.ZERO
+  def isInfinity  =  special && exponent(1 downto 0) === FpuFloat.INFINITY
+  def isNan       =  special && exponent(1 downto 0) === FpuFloat.NAN
  def isQuiet     =  mantissa.msb

  def setNormal    =  { special := False }
-  def setZero      =  { special := True; exponent(1 downto 0) := 0 }
-  //def setSubnormal =  { special := True; exponent(1 downto 0) := 1 }
-  def setInfinity  =  { special := True; exponent(1 downto 0) := 2 }
-  def setNan       =  { special := True; exponent(1 downto 0) := 3 }
-  def setNanQuiet  =  { special := True; exponent(1 downto 0) := 3; mantissa.msb := True }
+  def setZero      =  { special := True; exponent(1 downto 0) := FpuFloat.ZERO }
+  def setInfinity  =  { special := True; exponent(1 downto 0) := FpuFloat.INFINITY }
+  def setNan       =  { special := True; exponent(1 downto 0) := FpuFloat.NAN }
+  def setNanQuiet  =  { special := True; exponent(1 downto 0) := FpuFloat.NAN ; mantissa.msb := True }

  def decode() = {
    val ret = FpuFloatDecoded()
@ -122,7 +116,6 @@ case class FpuCompletion() extends Bundle{

 case class FpuCmd(p : FpuParameter) extends Bundle{
  val opcode = p.Opcode()
-  val value = Bits(32 bits) // Int to float
  val arg = Bits(2 bits) 
  val rs1, rs2, rs3 = p.rfAddress()
  val rd = p.rfAddress()
--- a/src/main/scala/vexriscv/plugin/FpuPlugin.scala
+++ b/src/main/scala/vexriscv/plugin/FpuPlugin.scala
@ -161,15 +161,13 @@ class FpuPlugin(externalFpu : Boolean = false,
      //Maybe it might be better to not fork before fire to avoid RF stall on commits
      val forked = Reg(Bool) setWhen(port.cmd.fire) clearWhen(!arbitration.isStuck) init(False)

-      val intRfReady = Reg(Bool()) setWhen(!arbitration.isStuckByOthers) clearWhen(!arbitration.isStuck) //TODO is that still in use ?
-      val hazard = (input(RS1_USE) && !intRfReady) || csr.pendings.msb || csr.csrActive
+      val hazard = csr.pendings.msb || csr.csrActive

      arbitration.haltItself setWhen(arbitration.isValid && input(FPU_ENABLE) && hazard)
      arbitration.haltItself setWhen(port.cmd.isStall)

      port.cmd.valid    := arbitration.isValid && input(FPU_ENABLE) && !forked && !hazard
      port.cmd.opcode   := input(FPU_OPCODE)
-      port.cmd.value    := RegNext(output(RS1))
      port.cmd.arg      := input(FPU_ARG)
      port.cmd.rs1      := ((input(FPU_OPCODE) === FpuOpcode.STORE) ? input(INSTRUCTION)(rs2Range).asUInt | input(INSTRUCTION)(rs1Range).asUInt)
      port.cmd.rs2      := input(INSTRUCTION)(rs2Range).asUInt
@ -179,7 +177,7 @@ class FpuPlugin(externalFpu : Boolean = false,

      insert(FPU_FORKED) := forked || port.cmd.fire

-      insert(FPU_COMMIT_SYNC) := List(FpuOpcode.LOAD, FpuOpcode.FMV_W_X).map(_ === input(FPU_OPCODE)).orR
+      insert(FPU_COMMIT_SYNC) := List(FpuOpcode.LOAD, FpuOpcode.FMV_W_X, FpuOpcode.I2F).map(_ === input(FPU_OPCODE)).orR
      insert(FPU_COMMIT_LOAD) := input(FPU_OPCODE) === FpuOpcode.LOAD
    }

--- a/src/test/scala/vexriscv/ip/fpu/FpuTest.scala
+++ b/src/test/scala/vexriscv/ip/fpu/FpuTest.scala
@ -64,7 +64,6 @@ class FpuTest extends FunSuite{
        def loadRaw(rd : Int, value : BigInt): Unit ={
          cmdQueue += {cmd =>
            cmd.opcode #= cmd.opcode.spinalEnum.LOAD
-            cmd.value.randomize()
            cmd.rs1.randomize()
            cmd.rs2.randomize()
            cmd.rs3.randomize()
@ -85,7 +84,6 @@ class FpuTest extends FunSuite{
        def storeRaw(rs : Int)(body : FpuRsp => Unit): Unit ={
          cmdQueue += {cmd =>
            cmd.opcode #= cmd.opcode.spinalEnum.STORE
-            cmd.value.randomize()
            cmd.rs1 #= rs
            cmd.rs2.randomize()
            cmd.rs3.randomize()
@ -103,7 +101,6 @@ class FpuTest extends FunSuite{
        def mul(rd : Int, rs1 : Int, rs2 : Int): Unit ={
          cmdQueue += {cmd =>
            cmd.opcode #= cmd.opcode.spinalEnum.MUL
-            cmd.value.randomize()
            cmd.rs1 #= rs1
            cmd.rs2 #= rs2
            cmd.rs3.randomize()
@ -119,7 +116,6 @@ class FpuTest extends FunSuite{
        def add(rd : Int, rs1 : Int, rs2 : Int): Unit ={
          cmdQueue += {cmd =>
            cmd.opcode #= cmd.opcode.spinalEnum.ADD
-            cmd.value.randomize()
            cmd.rs1 #= rs1
            cmd.rs2 #= rs2
            cmd.rs3.randomize()
@ -135,7 +131,6 @@ class FpuTest extends FunSuite{
        def div(rd : Int, rs1 : Int, rs2 : Int): Unit ={
          cmdQueue += {cmd =>
            cmd.opcode #= cmd.opcode.spinalEnum.DIV
-            cmd.value.randomize()
            cmd.rs1 #= rs1
            cmd.rs2 #= rs2
            cmd.rs3.randomize()
@ -151,7 +146,6 @@ class FpuTest extends FunSuite{
        def sqrt(rd : Int, rs1 : Int): Unit ={
          cmdQueue += {cmd =>
            cmd.opcode #= cmd.opcode.spinalEnum.SQRT
-            cmd.value.randomize()
            cmd.rs1 #= rs1
            cmd.rs2.randomize()
            cmd.rs3.randomize()
@ -167,7 +161,6 @@ class FpuTest extends FunSuite{
        def fma(rd : Int, rs1 : Int, rs2 : Int, rs3 : Int): Unit ={
          cmdQueue += {cmd =>
            cmd.opcode #= cmd.opcode.spinalEnum.FMA
-            cmd.value.randomize()
            cmd.rs1 #= rs1
            cmd.rs2 #= rs2
            cmd.rs3 #= rs3
@ -184,7 +177,6 @@ class FpuTest extends FunSuite{
        def cmp(rs1 : Int, rs2 : Int)(body : FpuRsp => Unit): Unit ={
          cmdQueue += {cmd =>
            cmd.opcode #= cmd.opcode.spinalEnum.CMP
-            cmd.value.randomize()
            cmd.rs1 #= rs1
            cmd.rs2 #= rs2
            cmd.rs3.randomize()
@ -197,7 +189,6 @@ class FpuTest extends FunSuite{
        def f2i(rs1 : Int, signed : Boolean)(body : FpuRsp => Unit): Unit ={
          cmdQueue += {cmd =>
            cmd.opcode #= cmd.opcode.spinalEnum.F2I
-            cmd.value.randomize()
            cmd.rs1 #= rs1
            cmd.rs2.randomize()
            cmd.rs3.randomize()
@ -210,7 +201,6 @@ class FpuTest extends FunSuite{
        def i2f(rd : Int, value : Int, signed : Boolean): Unit ={
          cmdQueue += {cmd =>
            cmd.opcode #= cmd.opcode.spinalEnum.I2F
-            cmd.value #= value.toLong & 0xFFFFFFFFl
            cmd.rs1.randomize()
            cmd.rs2.randomize()
            cmd.rs3.randomize()
@ -219,14 +209,14 @@ class FpuTest extends FunSuite{
          }
          commitQueue += {cmd =>
            cmd.write #= true
-            cmd.sync #= false
+            cmd.sync #= true
+            cmd.value #= value.toLong & 0xFFFFFFFFl
          }
        }

        def fmv_x_w(rs1 : Int)(body : FpuRsp => Unit): Unit ={
          cmdQueue += {cmd =>
            cmd.opcode #= cmd.opcode.spinalEnum.FMV_X_W
-            cmd.value.randomize()
            cmd.rs1 #= rs1
            cmd.rs2.randomize()
            cmd.rs3.randomize()
@ -239,7 +229,6 @@ class FpuTest extends FunSuite{
        def fmv_w_x(rd : Int, value : Int): Unit ={
          cmdQueue += {cmd =>
            cmd.opcode #= cmd.opcode.spinalEnum.FMV_W_X
-            cmd.value.randomize()
            cmd.rs1.randomize()
            cmd.rs2.randomize()
            cmd.rs3.randomize()
@ -256,7 +245,6 @@ class FpuTest extends FunSuite{
        def min(rd : Int, rs1 : Int, rs2 : Int): Unit ={
          cmdQueue += {cmd =>
            cmd.opcode #= cmd.opcode.spinalEnum.MIN_MAX
-            cmd.value.randomize()
            cmd.rs1 #= rs1
            cmd.rs2 #= rs2
            cmd.rs3.randomize()
@ -273,7 +261,6 @@ class FpuTest extends FunSuite{
        def sgnj(rd : Int, rs1 : Int, rs2 : Int): Unit ={
          cmdQueue += {cmd =>
            cmd.opcode #= cmd.opcode.spinalEnum.SGNJ
-            cmd.value.randomize()
            cmd.rs1 #= rs1
            cmd.rs2 #= rs2
            cmd.rs3.randomize()
@ -447,9 +434,11 @@ class FpuTest extends FunSuite{
          val rd = Random.nextInt(32)
          i2f(rd, a, signed)
          storeFloat(rd){v =>
-            val ref = a.toInt
-            println(f"i2f($a) = $v, $ref")
-            assert(v === ref)
+            val aLong = if(signed) a.toLong else a.toLong & 0xFFFFFFFFl
+            val ref = if(signed) a.toFloat else (a.toLong & 0xFFFFFFFFl).toFloat
+            println(f"i2f($aLong) = $v, $ref")
+            if(ref.abs < (1 << 22)) assert(v === ref)
+            assert(checkFloat(v, ref))
          }
        }

@ -542,6 +531,65 @@ class FpuTest extends FunSuite{
        val fNan = List(Float.NaN, b2f(0x7f820000), b2f(0x7fc00000))
        val fAll = fZeros ++ fSubnormals ++ fExpSmall ++ fExpNormal ++ fExpBig ++ fInfinity ++ fNan

+        val iSmall = (0 to 20)
+        val iBigUnsigned = (0 to 20).map(e => 0xFFFFFFFF - e)
+        val iBigSigned = (0 to 20).map(e => 0x7FFFFFFF - e) ++ (0 to 20).map(e => 0x80000000 + e)
+        val iUnsigned = iSmall ++ iBigUnsigned
+        val iSigned = iSmall ++ iSmall.map(-_) ++ iBigSigned
+
+
+        testLoadStore(1.17549435082e-38f)
+        testLoadStore(1.4E-45f)
+        testLoadStore(3.44383110592e-41f)
+
+        testAdd(b2f(0x3f800000), b2f(0x3f800000-1))
+        testAdd(1.1f, 2.3f)
+        testAdd(1.2f, -1.2f)
+        testAdd(-1.2f, 1.2f)
+        testAdd(0.0f, -1.2f)
+        testAdd(-0.0f, -1.2f)
+        testAdd(1.2f, -0f)
+        testAdd(1.2f, 0f)
+        testAdd(1.1f, Float.MinPositiveValue)
+
+        for(a <- fAll; _ <- 0 until 50) testAdd(a, randomFloat())
+        for(b <- fAll; _ <- 0 until 50) testAdd(randomFloat(), b)
+        for(a <- fAll; b <- fAll) testAdd(a, b)
+        for(_ <- 0 until 1000) testAdd(randomFloat(), randomFloat())
+
+
+
+        testLoadStore(1.2f)
+        testMul(1.2f, 2.5f)
+        testMul(b2f(0x00400000), 16.0f)
+        testMul(b2f(0x00100000), 16.0f)
+        testMul(b2f(0x00180000), 16.0f)
+        testMul(b2f(0x00000004), 16.0f)
+        testMul(b2f(0x00000040), 16.0f)
+        testMul(b2f(0x00000041), 16.0f)
+        testMul(b2f(0x00000001), b2f(0x00000001))
+        testMul(1.0f, b2f(0x00000001))
+        testMul(0.5f, b2f(0x00000001))
+
+        //        dut.clockDomain.waitSampling(1000)
+        //        simSuccess()
+
+        testMul(1.2f, 0f)
+        for(a <- fAll; _ <- 0 until 50) testMul(a, randomFloat())
+        for(b <- fAll; _ <- 0 until 50) testMul(randomFloat(), b)
+        for(a <- fAll; b <- fAll) testMul(a, b)
+        for(_ <- 0 until 1000) testMul(randomFloat(), randomFloat())
+
+
+
+        testLoadStore(1.765f)
+        testFmv_w_x(lang.Float.floatToIntBits(7.234f))
+        testI2f(64, false)
+        for(i <- iUnsigned) testI2f(i, false)
+        for(i <- iSigned) testI2f(i, true)
+        for(_ <- 0 until 1000) testI2f(Random.nextInt(), Random.nextBoolean())
+
+
        testCmp(0.0f, 1.2f )
        testCmp(1.2f, 0.0f )
        testCmp(0.0f, -0.0f )
@ -576,41 +624,6 @@ class FpuTest extends FunSuite{



-        testAdd(b2f(0x3f800000), b2f(0x3f800000-1))
-        testAdd(1.1f, 2.3f)
-        testAdd(1.2f, -1.2f)
-        testAdd(-1.2f, 1.2f)
-        testAdd(0.0f, -1.2f)
-        testAdd(-0.0f, -1.2f)
-        testAdd(1.2f, -0f)
-        testAdd(1.2f, 0f)
-        testAdd(1.1f, Float.MinPositiveValue)
-
-        for(a <- fAll; _ <- 0 until 50) testAdd(a, randomFloat())
-        for(b <- fAll; _ <- 0 until 50) testAdd(randomFloat(), b)
-        for(a <- fAll; b <- fAll) testAdd(a, b)
-        for(_ <- 0 until 1000) testAdd(randomFloat(), randomFloat())
-
-        testLoadStore(1.2f)
-        testMul(1.2f, 2.5f)
-        testMul(b2f(0x00400000), 16.0f)
-        testMul(b2f(0x00100000), 16.0f)
-        testMul(b2f(0x00180000), 16.0f)
-        testMul(b2f(0x00000004), 16.0f)
-        testMul(b2f(0x00000040), 16.0f)
-        testMul(b2f(0x00000041), 16.0f)
-        testMul(b2f(0x00000001), b2f(0x00000001))
-        testMul(1.0f, b2f(0x00000001))
-        testMul(0.5f, b2f(0x00000001))
-
-//        dut.clockDomain.waitSampling(1000)
-//        simSuccess()
-
-        testMul(1.2f, 0f)
-        for(a <- fAll; _ <- 0 until 50) testMul(a, randomFloat())
-        for(b <- fAll; _ <- 0 until 50) testMul(randomFloat(), b)
-        for(a <- fAll; b <- fAll) testMul(a, b)
-        for(_ <- 0 until 1000) testMul(randomFloat(), randomFloat())