fpu f64/f32 pass all tests

2021-02-12 14:48:44 +01:00 · 2021-02-12 14:48:44 +01:00 · 7d3b35c32c
parent 9a25a12879
commit 7d3b35c32c
3 changed files with 396 additions and 74 deletions
--- a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
+++ b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
@ -56,6 +56,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
    val arg = p.Arg()
    val roundMode = FpuRoundMode()
    val format = p.withDouble generate FpuFormat()
+    val rs1Boxed, rs2Boxed = p.withDouble generate Bool()
  }


@ -79,6 +80,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
    val arg = Bits(2 bits)
    val roundMode = FpuRoundMode()
    val format = p.withDouble generate FpuFormat()
+    val rs1Boxed, rs2Boxed = p.withDouble generate Bool()
  }

  case class MulInput() extends Bundle{
@ -198,7 +200,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{

  //TODO nan boxing decoding
  val read = new Area{
-    val arbiter = StreamArbiterFactory.noLock.lowerFirst.build(FpuCmd(p), portCount)
+    val arbiter = StreamArbiterFactory.noLock.roundRobin.build(FpuCmd(p), portCount)
    arbiter.io.inputs <> Vec(io.port.map(_.cmd))

    val s0 = Stream(RfReadInput())
@ -261,23 +263,28 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
    output.rs2 := rs2Entry.value
    output.rs3 := rs3Entry.value
    if(p.withDouble){
+      output.rs1Boxed := rs1Entry.boxed
+      output.rs2Boxed := rs2Entry.boxed
      output.format := s1.format
      val store = s1.opcode === FpuOpcode.STORE ||s1.opcode === FpuOpcode.FMV_X_W
-      when(store){ //Pass through
+      val sgnjBypass = s1.opcode === FpuOpcode.SGNJ && s1.format === FpuFormat.DOUBLE
+      when(!sgnjBypass) {
+        when(store) { //Pass through
          output.format := rs1Entry.boxed ? FpuFormat.FLOAT | FpuFormat.DOUBLE
-      } elsewhen(s1.format === FpuFormat.FLOAT =/= rs1Entry.boxed){
+        } elsewhen (s1.format === FpuFormat.FLOAT =/= rs1Entry.boxed) {
          output.rs1.setNanQuiet
          output.rs1.sign := False
        }
-      when(s1.format === FpuFormat.FLOAT =/= rs2Entry.boxed){
+        when(s1.format === FpuFormat.FLOAT =/= rs2Entry.boxed) {
          output.rs2.setNanQuiet
          output.rs2.sign := False
        }
-      when(s1.format === FpuFormat.FLOAT =/= rs3Entry.boxed){
+        when(s1.format === FpuFormat.FLOAT =/= rs3Entry.boxed) {
          output.rs3.setNanQuiet
        }
      }
    }
+  }

  val decode = new Area{
    val input = read.output.combStage()
@ -686,8 +693,11 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
      )
      val result = (Mux(resign, ~unsigned, unsigned) + (resign ^ increment).asUInt)
      val overflow  = (input.rs1.exponent > (input.arg(0) ? U(exponentOne+30) | U(exponentOne+31)) || input.rs1.isInfinity) && !input.rs1.sign || input.rs1.isNan
-      val underflow = (input.rs1.exponent > U(exponentOne+31) || input.arg(0) && unsigned.msb && unsigned(30 downto 0) =/= 0 || !input.arg(0) && (unsigned =/= 0 || increment) || input.rs1.isInfinity) && input.rs1.sign
+      val underflow = (input.rs1.exponent > U(exponentOne+31) || input.arg(0) && unsigned.msb && (unsigned(30 downto 0) =/= 0 || increment) || !input.arg(0) && (unsigned =/= 0 || increment) || input.rs1.isInfinity) && input.rs1.sign
      val isZero = input.rs1.isZero
+      if(p.withDouble){
+        overflow setWhen(!input.rs1.sign && increment && unsigned(30 downto 0).andR && (input.arg(0) || unsigned(31)))
+      }
      when(isZero){
        result := 0
      } elsewhen(underflow || overflow) {
@ -720,7 +730,13 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
    val minMaxSelectNanQuiet = input.rs1.isNan && input.rs2.isNan
    val cmpResult = B(rs1Smaller && !bothZero && !input.arg(1) || (rs1Equal || bothZero) && !input.arg(0))
    when(input.rs1.isNan || input.rs2.isNan) { cmpResult := 0 }
-    val sgnjResult = (input.rs1.sign && input.arg(1)) ^ input.rs2.sign ^ input.arg(0)
+    val sgnjRs1Sign = CombInit(input.rs1.sign)
+    val sgnjRs2Sign = CombInit(input.rs2.sign)
+    if(p.withDouble){
+      sgnjRs1Sign setWhen(input.rs1Boxed && input.format === FpuFormat.DOUBLE)
+      sgnjRs2Sign setWhen(input.rs2Boxed && input.format === FpuFormat.DOUBLE)
+    }
+    val sgnjResult = (sgnjRs1Sign && input.arg(1)) ^ sgnjRs2Sign ^ input.arg(0)
    val fclassResult = B(0, 32 bits)
    val decoded = input.rs1.decode()
    fclassResult(0) :=  input.rs1.sign &&  decoded.isInfinity
@ -771,6 +787,22 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
      }
      is(FpuOpcode.SGNJ){
        rfOutput.value.sign := sgnjResult
+        if(p.withDouble) when(input.format === FpuFormat.DOUBLE){
+          when(input.rs1Boxed){
+            rfOutput.value.sign := input.rs1.sign
+            rfOutput.format := FpuFormat.FLOAT
+          }
+//          //kill boxing => F32 -> F64 NAN
+//          when(input.rs1Boxed && !sgnjResult){
+//            rfOutput.value.setNan
+//            rfOutput.value.mantissa.setAll()
+//            rfOutput.value.mantissa(31 downto 0) := input.rs1.sign ## input.rs1.exponent
+//          }
+//          //Spawn boxing => F64 NAN -> F32
+//          when(!input.rs1Boxed && input.rs1.exponent === exponentOne + 1024 && input.rs1.mantissa(32, 52-32 bits).andR && sgnjResult){
+//
+//          }
+        }
      }
      if(p.withDouble) is(FpuOpcode.FCVT_X_X){
        rfOutput.format := ((input.format === FpuFormat.FLOAT) ? FpuFormat.DOUBLE | FpuFormat.FLOAT)
--- a/src/main/scala/vexriscv/plugin/FpuPlugin.scala
+++ b/src/main/scala/vexriscv/plugin/FpuPlugin.scala
@ -17,6 +17,7 @@ class FpuPlugin(externalFpu : Boolean = false,
  object FPU_FORKED extends Stageable(Bool())
  object FPU_OPCODE extends Stageable(FpuOpcode())
  object FPU_ARG extends Stageable(Bits(2 bits))
+  object FPU_FORMAT extends Stageable(FpuFormat())

  var port : FpuPort = null

@ -49,6 +50,7 @@ class FpuPlugin(externalFpu : Boolean = false,
    val fminMax = floatRfWrite :+ FPU_OPCODE -> FpuOpcode.MIN_MAX
    val fmvWx   = floatRfWrite :+ FPU_OPCODE -> FpuOpcode.FMV_W_X :+ RS1_USE -> True
    val fcvtI2f = floatRfWrite :+ FPU_OPCODE -> FpuOpcode.I2F     :+ RS1_USE -> True
+    val fcvtxx  = floatRfWrite :+ FPU_OPCODE -> FpuOpcode.FCVT_X_X

    val fcmp    = intRfWrite   :+ FPU_OPCODE -> FpuOpcode.CMP
    val fclass  = intRfWrite   :+ FPU_OPCODE -> FpuOpcode.FCLASS
@ -73,35 +75,69 @@ class FpuPlugin(externalFpu : Boolean = false,
    def arg(v : Int) = FPU_ARG -> U(v, 2 bits)
    val decoderService = pipeline.service(classOf[DecoderService])
    decoderService.addDefault(FPU_ENABLE, False)
+
+    // Decoder key/value pairs that tag an instruction with its FPU_FORMAT (FLOAT or DOUBLE).
+    val f32 = FPU_FORMAT -> FpuFormat.FLOAT
+    val f64 = FPU_FORMAT -> FpuFormat.DOUBLE
+
    decoderService.add(List(
-      FADD_S    -> (addSub :+ arg(0)),
-      FSUB_S    -> (addSub :+ arg(1)),
-      FMADD_S   -> (fma :+ arg(0)),
-      FMSUB_S   -> (fma :+ arg(2)),
-      FNMADD_S  -> (fma :+ arg(3)),
-      FNMSUB_S  -> (fma :+ arg(1)),
-      FMUL_S    -> (mul :+ arg(0)),
-      FDIV_S    -> (div),
-      FSQRT_S   -> (sqrt),
-      FLW       -> (fl),
-      FSW       -> (fs),
-      FCVT_S_WU -> (fcvtI2f :+ arg(0)),
-      FCVT_S_W  -> (fcvtI2f :+ arg(1)),
-      FCVT_WU_S -> (fcvtF2i :+ arg(0)),
-      FCVT_W_S ->  (fcvtF2i :+ arg(1)),
-      FCLASS_S  -> (fclass),
-      FLE_S     -> (fcmp :+ arg(0)),
-      FEQ_S     -> (fcmp :+ arg(2)),
-      FLT_S     -> (fcmp :+ arg(1)),
-      FSGNJ_S   -> (fsgnj :+ arg(0)),
-      FSGNJN_S  -> (fsgnj :+ arg(1)),
-      FSGNJX_S  -> (fsgnj :+ arg(2)),
-      FMIN_S    -> (fminMax :+ arg(0)),
-      FMAX_S    -> (fminMax :+ arg(1)),
-      FMV_X_W   -> (fmvXw),
-      FMV_W_X   -> (fmvWx)
+      FADD_S    -> (addSub  :+ f32 :+ arg(0)),
+      FSUB_S    -> (addSub  :+ f32 :+ arg(1)),
+      FMADD_S   -> (fma     :+ f32 :+ arg(0)),
+      FMSUB_S   -> (fma     :+ f32 :+ arg(2)),
+      FNMADD_S  -> (fma     :+ f32 :+ arg(3)),
+      FNMSUB_S  -> (fma     :+ f32 :+ arg(1)),
+      FMUL_S    -> (mul     :+ f32 :+ arg(0)),
+      FDIV_S    -> (div     :+ f32 ),
+      FSQRT_S   -> (sqrt    :+ f32 ),
+      FLW       -> (fl      :+ f32 ),
+      FSW       -> (fs      :+ f32 ),
+      FCVT_S_WU -> (fcvtI2f :+ f32 :+ arg(0)),
+      FCVT_S_W  -> (fcvtI2f :+ f32 :+ arg(1)),
+      FCVT_WU_S -> (fcvtF2i :+ f32 :+ arg(0)),
+      FCVT_W_S ->  (fcvtF2i :+ f32 :+ arg(1)),
+      FCLASS_S  -> (fclass  :+ f32 ),
+      FLE_S     -> (fcmp    :+ f32 :+ arg(0)),
+      FEQ_S     -> (fcmp    :+ f32 :+ arg(2)),
+      FLT_S     -> (fcmp    :+ f32 :+ arg(1)),
+      FSGNJ_S   -> (fsgnj   :+ f32 :+ arg(0)),
+      FSGNJN_S  -> (fsgnj   :+ f32 :+ arg(1)),
+      FSGNJX_S  -> (fsgnj   :+ f32 :+ arg(2)),
+      FMIN_S    -> (fminMax :+ f32 :+ arg(0)),
+      FMAX_S    -> (fminMax :+ f32 :+ arg(1)),
+      FMV_X_W   -> (fmvXw   :+ f32 ),
+      FMV_W_X   -> (fmvWx   :+ f32 )
    ))

+    // Double-precision encodings, registered only when the FPU is built with p.withDouble.
+    // Every key here must be a *_D / FLD / FSD encoding: the single-precision keys are already
+    // registered with f32 in the list above.
+    // NOTE(review): assumes FLD, FSD, FCVT_D_WU and FCVT_D_W are declared alongside the other
+    // double encodings used here — confirm.
+    if(p.withDouble){
+      decoderService.add(List(
+        FADD_D    -> (addSub  :+ f64 :+ arg(0)),
+        FSUB_D    -> (addSub  :+ f64 :+ arg(1)),
+        FMADD_D   -> (fma     :+ f64 :+ arg(0)),
+        FMSUB_D   -> (fma     :+ f64 :+ arg(2)),
+        FNMADD_D  -> (fma     :+ f64 :+ arg(3)),
+        FNMSUB_D  -> (fma     :+ f64 :+ arg(1)),
+        FMUL_D    -> (mul     :+ f64 :+ arg(0)),
+        FDIV_D    -> (div     :+ f64 ),
+        FSQRT_D   -> (sqrt    :+ f64 ),
+        FLD       -> (fl      :+ f64 ),
+        FSD       -> (fs      :+ f64 ),
+        FCVT_D_WU -> (fcvtI2f :+ f64 :+ arg(0)),
+        FCVT_D_W  -> (fcvtI2f :+ f64 :+ arg(1)),
+        FCVT_WU_D -> (fcvtF2i :+ f64 :+ arg(0)),
+        FCVT_W_D  -> (fcvtF2i :+ f64 :+ arg(1)),
+        FCLASS_D  -> (fclass  :+ f64 ),
+        FLE_D     -> (fcmp    :+ f64 :+ arg(0)),
+        FEQ_D     -> (fcmp    :+ f64 :+ arg(2)),
+        FLT_D     -> (fcmp    :+ f64 :+ arg(1)),
+        FSGNJ_D   -> (fsgnj   :+ f64 :+ arg(0)),
+        FSGNJN_D  -> (fsgnj   :+ f64 :+ arg(1)),
+        FSGNJX_D  -> (fsgnj   :+ f64 :+ arg(2)),
+        FMIN_D    -> (fminMax :+ f64 :+ arg(0)),
+        FMAX_D    -> (fminMax :+ f64 :+ arg(1)),
+        // FCVT_X_X carries the format of its source operand: D<-S reads an f32, S<-D reads an f64.
+        FCVT_D_S  -> (fcvtxx :+ f32),
+        FCVT_S_D  -> (fcvtxx :+ f64)
+      ))
+    }
    //TODO FMV_X_X + doubles

    port = FpuPort(p)
@ -178,7 +214,7 @@ class FpuPlugin(externalFpu : Boolean = false,
      port.cmd.rs2       := input(INSTRUCTION)(rs2Range).asUInt
      port.cmd.rs3       := input(INSTRUCTION)(rs3Range).asUInt
      port.cmd.rd        := input(INSTRUCTION)(rdRange).asUInt
-      port.cmd.format    := FpuFormat.FLOAT
+      port.cmd.format    := (if(p.withDouble) input(FPU_FORMAT) else FpuFormat.FLOAT())
      port.cmd.roundMode := roundMode.as(FpuRoundMode())

      insert(FPU_FORKED) := forked || port.cmd.fire
--- a/src/test/scala/vexriscv/ip/fpu/FpuTest.scala
+++ b/src/test/scala/vexriscv/ip/fpu/FpuTest.scala
@ -55,7 +55,7 @@ class FpuTest extends FunSuite{
  }

  def testP(p : FpuParameter){
-    val portCount = 1
+    val portCount = 4

    val config = SimConfig
    config.allOptimisation
@ -121,13 +121,13 @@ class FpuTest extends FunSuite{
          def f64_f64_i32 = {
            val str = next
            val s = new Scanner(str)
-            val a,b,c = (nextLong(s))
-            (b2d(a), b2d(b), c, s.nextInt(16))
+            val a,b = (nextLong(s))
+            (b2d(a), b2d(b), s.nextInt(16), s.nextInt(16))
          }

          def f64_f64 = {
            val s = new Scanner(next)
-            val a,b = (s.nextLong(16))
+            val a,b = nextLong(s)
            (b2d(a), b2d(b), s.nextInt(16))
          }

@ -501,6 +501,16 @@ class FpuTest extends FunSuite{
 //          if(ref + Float.MinPositiveValue*2.0f  === dut || dut + Float.MinPositiveValue*2.0f  === ref)
          false
        }
+
+        /** Approximate comparison of a reference double against the value produced by the DUT.
+          *
+          * Rejects any mismatch in the top bit of the two d2b encodings, and a pair of zeros whose
+          * encodings differ. Accepts two NaNs or two equal values. Otherwise accepts magnitudes that
+          * lie within a 1.0001 / 0.9999 relative band, widened on both sides by
+          * Double.MinPositiveValue so the slack is symmetric at double precision.
+          *
+          * @param ref expected value
+          * @param dut value read back from the DUT
+          * @return true when dut is an acceptable match for ref
+          */
+        def checkDouble(ref : Double, dut : Double): Boolean ={
+          if((d2b(ref) & Long.MinValue) != (d2b(dut) & Long.MinValue)) false
+          else if(ref == 0.0 && dut == 0.0 && d2b(ref) != d2b(dut)) false
+          else if(ref.isNaN && dut.isNaN) true
+          else if(ref == dut) true
+          else ref.abs * 1.0001 + Double.MinPositiveValue >= dut.abs*0.9999 && ref.abs * 0.9999 - Double.MinPositiveValue <= dut.abs*1.0001
+        }
        def checkFloatExact(ref : Float, dut : Float): Boolean ={
          if(ref.signum != dut.signum === dut) return  false
          if(ref.isNaN && dut.isNaN) return true
@ -514,6 +524,11 @@ class FpuTest extends FunSuite{
          (Random.nextDouble() * (Math.pow(2.0, exp)) * (if(Random.nextBoolean()) -1.0 else 1.0)).toFloat
        }

+        /** Draws a random double: a Random.nextDouble() sample scaled by 2^e with e in [-5, 4], with a random sign. */
+        def randomDouble(): Double ={
+          val exponent = Random.nextInt(10) - 5
+          val magnitude = Random.nextDouble() * Math.pow(2.0, exponent)
+          val signFactor = if(Random.nextBoolean()) -1.0 else 1.0
+          magnitude * signFactor
+        }
+

        def testBinaryOp(op : (Int,Int,Int,FpuRoundMode.E, FpuFormat.E) => Unit, a : Float, b : Float, ref : Float, flag : Int, rounding : FpuRoundMode.E, opName : String): Unit ={
          val rs = new RegAllocator()
@ -538,7 +553,7 @@ class FpuTest extends FunSuite{
          load(rs2, b)
          op(rd,rs1,rs2, rounding, FpuFormat.DOUBLE)
          store(rd){v =>
-            assert(d2b(v) == d2b(ref), f"## ${a}  ${opName}  $b = $v, $ref $rounding")
+            assert(d2b(v) == d2b(ref), f"## ${a}  ${opName}  $b = $v, $ref $rounding, ${d2b(a).toString(16)} ${d2b(b).toString(16)} ${d2b(ref).toString(16)}")
          }

          flagMatch(flag, ref, f"## ${opName} ${a} $b $ref $rounding")
@ -609,7 +624,7 @@ class FpuTest extends FunSuite{
          store(rd){v =>
            assert(d2b(v) == d2b(ref), f"testCvtF32F64Raw $a $ref $rounding")
          }
-          flagMatch(flag, f"testCvtF32F64Raw $a $ref $rounding")
+          flagMatch(flag,ref, f"testCvtF32F64Raw $a $ref $rounding")
        }

        def testCvtF64F32Raw(a : Double, ref : Float, flag : Int, rounding : FpuRoundMode.E): Unit ={
@ -619,7 +634,7 @@ class FpuTest extends FunSuite{
          storeFloat(rd){v =>
            assert(d2b(v) == d2b(ref), f"testCvtF64F32Raw $a $ref $rounding")
          }
-          flagMatch(flag, f"testCvtF64F32Raw $a $ref $rounding")
+          flagMatch(flag, ref, f"testCvtF64F32Raw $a $ref $rounding")
        }


@ -646,6 +661,30 @@ class FpuTest extends FunSuite{
        }


+        // Loads `a` into a randomly chosen register, issues an fclass in DOUBLE format on it and
+        // checks the returned value against a one-hot reference derived from the d2b encoding of `a`.
+        def testClassF64Raw(a : Double) : Unit = {
+          val rd = Random.nextInt(32)


+          load(rd, a)
+          fclass(rd, FpuFormat.DOUBLE){v =>
+            // Split the encoding into its low 52 bits, the following 11 bits and the top bit.
+            val mantissa = d2b(a) & 0x000FFFFFFFFFFFFFl
+            val exp = (d2b(a) >> 52) & 0x7FF
+            val sign = (d2b(a) >> 63) & 0x1

+            // Reference bit index (value for sign == 0 / value otherwise):
+            //   infinity 7/0, NaN 9 when mantissa bit 51 is set else 8,
+            //   exp == 0 with non-zero mantissa 5/2, exp == 0 with zero mantissa 4/3, anything else 6/1.
+            val refBit = if(a.isInfinite) (if(sign == 0) 7 else 0)
+            else if(a.isNaN) (if((mantissa >> 51) != 0) 9 else 8)
+            else if(exp == 0 && mantissa != 0) (if(sign == 0) 5 else 2)
+            else if(exp == 0 && mantissa == 0) (if(sign == 0) 4 else 3)
+            else if(sign == 0) 6 else 1

+            val ref = 1 << refBit

+            assert(v == ref, f"fclass $a")
+          }
+        }
+
+
+
        def testFmaRaw(a : Float, b : Float, c : Float): Unit ={
          val rs = new RegAllocator()
          val rs1, rs2, rs3 = rs.allocate()
@ -663,6 +702,23 @@ class FpuTest extends FunSuite{
        }


+
+        // Loads a, b and c into three allocated registers, issues an fma in DOUBLE format with RNE
+        // rounding into a random destination and compares the stored result with a*b + c computed
+        // on the host, using the approximate checkDouble comparison.
+        def testFmaF64Raw(a : Double, b : Double, c : Double): Unit ={
+          val rs = new RegAllocator()
+          val rs1, rs2, rs3 = rs.allocate()
+          val rd = Random.nextInt(32)
+          load(rs1, a)
+          load(rs2, b)
+          load(rs3, c)

+          fma(rd,rs1,rs2,rs3, FpuRoundMode.RNE, FpuFormat.DOUBLE)
+          store(rd){v =>
+            val ref = a.toDouble * b.toDouble + c.toDouble
+            val mul = a.toDouble * b.toDouble
+            // Only asserted when |a*b| exceeds |c| by more than 10% of |a*b|; other operand
+            // combinations are not checked (presumably to avoid near-cancellation cases — confirm).
+            if((mul.abs-c.abs)/mul.abs > 0.1)  assert(checkDouble(ref, v), f"$a%.20f * $b%.20f + $c%.20f = $v%.20f, $ref%.20f")
+          }
+        }
+
        def testSqrtExact(a : Float, ref : Float, flag : Int, rounding : FpuRoundMode.E): Unit ={
          val rs = new RegAllocator()
          val rs1, rs2, rs3 = rs.allocate()
@ -690,6 +746,32 @@ class FpuTest extends FunSuite{
          }
        }

+        // Loads `a`, issues a sqrt in DOUBLE format into a random destination and compares the
+        // stored result with `ref` using the approximate checkDouble comparison.
+        // NOTE(review): the command is always issued with FpuRoundMode.RNE; `rounding` only appears
+        // in the failure message and `flag` is not checked here. rs2 and rs3 are allocated but unused.
+        def testSqrtF64Exact(a : Double, ref : Double, flag : Int, rounding : FpuRoundMode.E): Unit ={
+          val rs = new RegAllocator()
+          val rs1, rs2, rs3 = rs.allocate()
+          val rd = Random.nextInt(32)
+          load(rs1, a)

+          sqrt(rd,rs1,  FpuRoundMode.RNE, FpuFormat.DOUBLE)
+          store(rd){v =>
+            // Relative error, computed for the failure message only.
+            val error = Math.abs(ref-v)/ref
+            assert(checkDouble(ref, v), f"sqrt($a) = $v, $ref $error $rounding")
+          }
+        }
+
+        // Loads `a` and `b`, issues a div in DOUBLE format into a random destination and compares
+        // the stored result with `ref` using the approximate checkDouble comparison.
+        // NOTE(review): the command is always issued with FpuRoundMode.RNE; `rounding` only appears
+        // in the failure message and `flag` is not checked here. rs3 is allocated but unused.
+        def testDivF64Exact(a : Double, b : Double, ref : Double, flag : Int, rounding : FpuRoundMode.E): Unit ={
+          val rs = new RegAllocator()
+          val rs1, rs2, rs3 = rs.allocate()
+          val rd = Random.nextInt(32)
+          load(rs1, a)
+          load(rs2, b)

+          div(rd,rs1, rs2, FpuRoundMode.RNE, FpuFormat.DOUBLE)
+          store(rd){v =>
+            // Relative error, computed for the failure message only.
+            val error = Math.abs(ref-v)/ref
+            assert(checkDouble(ref, v), f"div($a, $b) = $v, $ref $error $rounding")
+          }
+        }


        def testF2iExact(a : Float, ref : Int, flag : Int, signed : Boolean, rounding : FpuRoundMode.E): Unit ={
@ -793,6 +875,23 @@ class FpuTest extends FunSuite{
        def testEqRaw(a : Float, b : Float, ref : Int, flag : Int) = testCmpExact(a,b,ref,flag, 2)
        def testLtRaw(a : Float, b : Float, ref : Int, flag : Int) = testCmpExact(a,b,ref,flag, 1)

+
+        // Loads `a` and `b`, issues a cmp in DOUBLE format selected by `arg`, asserts that the
+        // response value (converted to Int) equals `ref`, then checks the flags against `flag`.
+        // rs3 and rd are obtained but not used by the command.
+        def testCmpF64Exact(a : Double, b : Double, ref : Int, flag : Int, arg : Int): Unit ={
+          val rs = new RegAllocator()
+          val rs1, rs2, rs3 = rs.allocate()
+          val rd = Random.nextInt(32)
+          load(rs1, a)
+          load(rs2, b)
+          cmp(rs1, rs2, arg, FpuFormat.DOUBLE){rsp =>
+            val v = rsp.value.toBigInt.toInt
+            assert(v === ref, f"cmp($a, $b, $arg) = $v, $ref")
+          }
+          flagMatch(flag,f"$a < $b $ref $flag ${d2b(a)} ${d2b(b)}")
+        }
+        // Named wrappers fixing `arg`: 0 for Le, 2 for Eq, 1 for Lt.
+        def testLeF64Raw(a : Double, b : Double, ref : Int, flag : Int) = testCmpF64Exact(a,b,ref,flag, 0)
+        def testEqF64Raw(a : Double, b : Double, ref : Int, flag : Int) = testCmpF64Exact(a,b,ref,flag, 2)
+        def testLtF64Raw(a : Double, b : Double, ref : Int, flag : Int) = testCmpF64Exact(a,b,ref,flag, 1)
+
 //        def testFmv_x_w(a : Float): Unit ={
 //          val rs = new RegAllocator()
 //          val rs1, rs2, rs3 = rs.allocate()
@ -849,6 +948,35 @@ class FpuTest extends FunSuite{
        def testMaxExact(a : Float, b : Float) : Unit = testMinMaxExact(a,b,1)


+        // Computes the expected result and flags for a DOUBLE min/max on the host, issues the
+        // operation on the DUT and requires a bit-exact match (via d2b) plus matching flags.
+        // arg == 0 selects min, any other value selects max.
+        def testMinMaxF64Exact(a : Double, b : Double, arg : Int): Unit ={
+          val rs = new RegAllocator()
+          val rs1, rs2 = rs.allocate()
+          val rd = Random.nextInt(32)
+          // Reference value: a fixed NaN pattern when both inputs are NaN, the other operand when
+          // exactly one is NaN, otherwise Math.min / Math.max.
+          val ref = (a,b) match {
+            case _ if a.isNaN && b.isNaN => b2d(0x7ff8000000000000l)
+            case _ if a.isNaN => b
+            case _ if b.isNaN => a
+            case _ => if(arg == 0) Math.min(a,b) else Math.max(a,b)
+          }
+          // Reference flags: 16 when either operand is a NaN whose bit 51 is clear, else 0.
+          val flag = (a,b) match {
+            case _ if a.isNaN && ((d2b(a) >> 51 ) & 1) == 0 => 16
+            case _ if b.isNaN && ((d2b(b) >> 51 ) & 1) == 0 => 16
+            case _ => 0
+          }
+          load(rs1, a)
+          load(rs2, b)

+          minMax(rd,rs1,rs2, arg, FpuFormat.DOUBLE)
+          store(rd){v =>
+            assert(d2b(ref) ==  d2b(v), f"minMax($a $b $arg) = $v, $ref")
+          }
+          flagMatch(flag, f"minmax($a $b $arg)")
+        }
+
+        // Named wrappers fixing `arg`: 0 for min, 1 for max.
+        def testMinF64Exact(a : Double, b : Double) : Unit = testMinMaxF64Exact(a,b,0)
+        def testMaxF64Exact(a : Double, b : Double) : Unit = testMinMaxF64Exact(a,b,1)
+
+
        def testSgnjRaw(a : Float, b : Float): Unit ={
          val ref = b2f((f2b(a) & ~0x80000000) | f2b(b) & 0x80000000)
          testBinaryOp(sgnj,a,b,ref,0, null,"sgnj")
@ -862,6 +990,23 @@ class FpuTest extends FunSuite{
          testBinaryOp(sgnjx,a,b,ref,0, null,"sgnjx")
        }

+        // Mask selecting bit 63 of a 64-bit encoding.
+        val f64SignMask = 1l << 63
+        // For the three checks below: when the upper 32 bits of `a` are all ones the reference is
+        // `a` itself; otherwise it is `a` with bit 63 replaced by (sgnj), replaced by the inverse
+        // of (sgnjn), or xored with (sgnjx) bit 63 of `b`.
+        def testSgnjF64Raw(a : Double, b : Double): Unit ={
+          val aBits = d2b(a).toLong
+          val bSign = d2b(b).toLong & f64SignMask
+          val ref = if(aBits >> 32 == -1) a else b2d((aBits & ~f64SignMask) | bSign)
+          testBinaryOpF64(sgnj,a,b,ref,0, null,"sgnj")
+        }
+        def testSgnjnF64Raw(a : Double, b : Double): Unit ={
+          val aBits = d2b(a).toLong
+          val invertedBSign = (d2b(b).toLong & f64SignMask) ^ f64SignMask
+          val ref = if(aBits >> 32 == -1) a else b2d((aBits & ~f64SignMask) | invertedBSign)
+          testBinaryOpF64(sgnjn,a,b,ref,0, null,"sgnjn")
+        }
+        def testSgnjxF64Raw(a : Double, b : Double): Unit ={
+          val aBits = d2b(a).toLong
+          val bSign = d2b(b).toLong & f64SignMask
+          val ref = if(aBits >> 32 == -1) a else b2d(aBits ^ bSign)
+          testBinaryOpF64(sgnjx,a,b,ref,0, null,"sgnjx")
+        }
+

        def withMinus(that : Seq[Float]) = that.flatMap(f => List(f, -f))
        val fZeros = withMinus(List(0.0f))
@ -887,25 +1032,46 @@ class FpuTest extends FunSuite{
          }
        }

-        def testFma() : Unit = {
+        def testFmaF32() : Unit = {
          testFmaRaw(randomFloat(), randomFloat(), randomFloat())
          flagClear()
        }

-        def testLe() : Unit = {
+
+        // Runs one f64 fma check on three freshly drawn random operands, then clears the flags.
+        def testFmaF64() : Unit = {
+          val a = randomDouble()
+          val b = randomDouble()
+          val c = randomDouble()
+          testFmaF64Raw(a, b, c)
+          flagClear()
+        }
+
+        def testLeF32() : Unit = {
          val (a,b,i,f) = f32.le.RAW.f32_f32_i32
          testLeRaw(a,b,i, f)
        }
-        def testLt() : Unit = {
+        def testLtF32() : Unit = {
          val (a,b,i,f) = f32.lt.RAW.f32_f32_i32
          testLtRaw(a,b,i, f)
        }

-        def testEq() : Unit = {
+        def testEqF32() : Unit = {
          val (a,b,i,f) = f32.eq.RAW.f32_f32_i32
          testEqRaw(a,b,i, f)
        }

+        // Each test reads one (a, b, result, flags) vector from the matching f64 source and
+        // forwards it to the corresponding raw comparison check.
+        def testLeF64() : Unit = {
+          val (lhs, rhs, expected, flags) = f64.le.RAW.f64_f64_i32
+          testLeF64Raw(lhs, rhs, expected, flags)
+        }
+        def testLtF64() : Unit = {
+          val (lhs, rhs, expected, flags) = f64.lt.RAW.f64_f64_i32
+          testLtF64Raw(lhs, rhs, expected, flags)
+        }
+
+        def testEqF64() : Unit = {
+          val (lhs, rhs, expected, flags) = f64.eq.RAW.f64_f64_i32
+          testEqF64Raw(lhs, rhs, expected, flags)
+        }
+
+
        def testF2uiF32() : Unit = {
          val rounding = FpuRoundMode.elements.randomPick()
          val (a,b,f) = f32.f2ui(rounding).f32_i32
@ -945,7 +1111,7 @@ class FpuTest extends FunSuite{
          flagClear()
        }

-        def testSgnj() : Unit = {
+        def testSgnjF32() : Unit = {
          testSgnjRaw(b2f(Random.nextInt()), b2f(Random.nextInt()))
          testSgnjnRaw(b2f(Random.nextInt()), b2f(Random.nextInt()))
          testSgnjxRaw(b2f(Random.nextInt()), b2f(Random.nextInt()))
@ -955,6 +1121,31 @@ class FpuTest extends FunSuite{
          testSgnjxRaw(a, b)
        }

+        // Picks a random rounding mode, reads one f64 division vector for it, runs the check and
+        // clears the flags.
+        def testDivF64() : Unit = {
+          val mode = FpuRoundMode.elements.randomPick()
+          val (dividend, divisor, expected, flags) = f64.div(mode).f64_f64_f64
+          testDivF64Exact(dividend, divisor, expected, flags, mode)
+          flagClear()
+        }
+
+        // Same flow as testDivF64, for the f64 square-root vectors.
+        def testSqrtF64() : Unit = {
+          val mode = FpuRoundMode.elements.randomPick()
+          val (operand, expected, flags) = f64.sqrt(mode).f64_f64
+          testSqrtF64Exact(operand, expected, flags, mode)
+          flagClear()
+        }
+
+        // Runs sgnj, sgnjn and sgnjx first on random 64-bit patterns, then on one vector pair
+        // (the vector's result and flag fields are not used).
+        def testSgnjF64() : Unit = {
+          val checks = List[(Double, Double) => Unit](testSgnjF64Raw, testSgnjnF64Raw, testSgnjxF64Raw)
+          checks.foreach(check => check(b2d(Random.nextLong()), b2d(Random.nextLong())))
+          val (a, b, _, _) = f64.sgnj.RAW.f64_f64_i32
+          checks.foreach(check => check(a, b))
+        }
+
+
        def testTransferF32() : Unit = {
          val (a,b,r,f) = f32.transfer.RAW.f32_f32_i32
          testTransferF32Raw(a, Random.nextBoolean(), Random.nextBoolean())
@ -985,20 +1176,35 @@ class FpuTest extends FunSuite{
          testCvtF64F32Raw(a, r, f, rounding)
        }

-        def testClass() : Unit = {
+        def testClassF32() : Unit = {
          val (a,b,r,f) = f32.fclass.RAW.f32_f32_i32
          testClassRaw(a)
        }

-        def testMin() : Unit = {
+        def testMinF32() : Unit = {
          val (a,b,r,f) = f32.min.RAW.f32_f32_f32
          testMinExact(a,b)
        }
-        def testMax() : Unit = {
+        def testMaxF32() : Unit = {
          val (a,b,r,f) = f32.max.RAW.f32_f32_f32
          testMaxExact(a,b)
        }

+        // Reads one vector and classifies its first operand; the remaining fields are unused.
+        def testClassF64() : Unit = {
+          val (operand, _, _, _) = f64.fclass.RAW.f64_f64_i32
+          testClassF64Raw(operand)
+        }
+
+        // Read one vector and run min / max on its two operands; the vector's result and flag
+        // fields are unused because testMin/MaxF64Exact derive their own reference.
+        def testMinF64() : Unit = {
+          val (lhs, rhs, _, _) = f64.min.RAW.f64_f64_f64
+          testMinF64Exact(lhs, rhs)
+        }
+        def testMaxF64() : Unit = {
+          val (lhs, rhs, _, _) = f64.max.RAW.f64_f64_f64
+          testMaxF64Exact(lhs, rhs)
+        }
+
+
        def testUI2f32() : Unit = {
          val rounding = FpuRoundMode.elements.randomPick()
          val (a,b,f) = f32.i2f(rounding).i32_f32
@ -1061,21 +1267,69 @@ class FpuTest extends FunSuite{
        }


-        val f32Tests = List[() => Unit](testSubF32, testAddF32, testMulF32, testI2f32, testUI2f32, testMin, testMax, testSgnj, testTransferF32, testDiv, testSqrt, testF2iF32, testF2uiF32, testLe, testEq, testLt, testClass, testFma)
+        val f32Tests = List[() => Unit](testSubF32, testAddF32, testMulF32, testI2f32, testUI2f32, testMinF32, testMaxF32, testSgnjF32, testTransferF32, testDiv, testSqrt, testF2iF32, testF2uiF32, testLeF32, testEqF32, testLtF32, testClassF32, testFmaF32)
+        // f64 test pool for the random mix. Uses testDivF64 / testSqrtF64 rather than the f32
+        // testDiv / testSqrt, which are already part of f32Tests.
+        val f64Tests = List[() => Unit](testSubF64, testAddF64, testMulF64, testI2f64, testUI2f64, testMinF64, testMaxF64, testSgnjF64, testTransferF64, testDivF64, testSqrtF64, testF2iF64, testF2uiF64, testLeF64, testEqF64, testLtF64, testClassF64, testFmaF64, testCvtF32F64, testCvtF64F32)
+
+        var fxxTests = f32Tests
+        if(p.withDouble) fxxTests ++= f64Tests
+        
        
        //TODO test boxing
        //TODO double <-> simple convertions
        if(p.withDouble) {
+          for(_ <- 0 until 10000) testCvtF64F32() // 1 did not equal 3 Flag missmatch dut=1 ref=3 testCvtF64F32Raw 1.1754942807573643E-38 1.17549435E-38 RMM
+          println("FCVT_D_S done")
          for(_ <- 0 until 10000) testCvtF32F64()
          println("FCVT_S_D done")
-          for(_ <- 0 until 10000) testCvtF64F32()
-          println("FCVT_D_S done")
+
+          for(_ <- 0 until 10000) testF2iF64()
+          println("f64 f2i done")
+          for(_ <- 0 until 10000) testF2uiF64()
+          println("f64 f2ui done")
+
+
+          for(_ <- 0 until 10000) testSgnjF64()
+          println("f64 sgnj done")
+
+
+
+          for(_ <- 0 until 10000) testMinF64()
+          for(_ <- 0 until 10000) testMaxF64()
+          println("f64 minMax done")
+
+
+
+          for(i <- 0 until 1000) testFmaF64()
+          flagClear()
+          println("f64 fma done") //TODO
+
+
+          for(_ <- 0 until 10000) testLeF64()
+          for(_ <- 0 until 10000) testLtF64()
+          for(_ <- 0 until 10000) testEqF64()
+          println("f64 Cmp done")
+
+
+          for(_ <- 0 until 10000) testDivF64()
+          println("f64 div done")
+
+          for(_ <- 0 until 10000) testSqrtF64()
+          println("f64 sqrt done")
+
+          for(_ <- 0 until 10000) testClassF64()
+          println("f64 class done")
+//
+
+
+
+
+



          for(_ <- 0 until 10000) testAddF64()
          for(_ <- 0 until 10000) testSubF64()
-          println("Add done")
+          println("f64 Add done")


          //          testI2f64Exact(0x7FFFFFF5, 0x7FFFFFF5, 0, true, FpuRoundMode.RNE)
@ -1083,9 +1337,7 @@ class FpuTest extends FunSuite{
          for(_ <- 0 until 10000) testI2f64()
          println("f64 i2f done")

-          for(_ <- 0 until 10000) testF2uiF64()
-          for(_ <- 0 until 10000) testF2iF64()
-          println("f64 f2i done")
+

 //          testF2iExact(1.0f,1, 0, false, FpuRoundMode.RTZ)
 //          testF2iExact(2.0f,2, 0, false, FpuRoundMode.RTZ)
@ -1156,7 +1408,7 @@ class FpuTest extends FunSuite{



-        for(i <- 0 until 1000) testFma()
+        for(i <- 0 until 1000) testFmaF32()
        flagClear()
        println("fma done") //TODO

@ -1166,9 +1418,9 @@ class FpuTest extends FunSuite{
        testEqRaw(Float.PositiveInfinity,Float.PositiveInfinity,1, 0)
        testEqRaw(0f, 0f,1, 0)

-        for(_ <- 0 until 10000) testLe()
-        for(_ <- 0 until 10000) testLt()
-        for(_ <- 0 until 10000) testEq()
+        for(_ <- 0 until 10000) testLeF32()
+        for(_ <- 0 until 10000) testLtF32()
+        for(_ <- 0 until 10000) testEqF32()
        println("Cmp done")


@ -1178,16 +1430,16 @@ class FpuTest extends FunSuite{
        for(_ <- 0 until 10000) testSqrt()
        println("f32 sqrt done")

-        for(_ <- 0 until 10000) testSgnj()
+        for(_ <- 0 until 10000) testSgnjF32()
        println("f32 sgnj done")


-        for(_ <- 0 until 10000) testClass()
+        for(_ <- 0 until 10000) testClassF32()
        println("f32 class done")


-        for(_ <- 0 until 10000) testMin()
-        for(_ <- 0 until 10000) testMax()
+        for(_ <- 0 until 10000) testMinF32()
+        for(_ <- 0 until 10000) testMaxF32()
        println("minMax done")


@ -1229,11 +1481,13 @@ class FpuTest extends FunSuite{
 //        dut.clockDomain.waitSampling(1000)
 //        simSuccess()

-        for(i <- 0 until 1000) f32Tests.randomPick()()
+        for(i <- 0 until 10000) fxxTests.randomPick()()
        waitUntil(cpu.rspQueue.isEmpty)
      }


+
+
      stim.foreach(_.join())
      dut.clockDomain.waitSampling(100)
    }