diff --git a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
index f5350b0..c674bfe 100644
--- a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
+++ b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
@@ -56,6 +56,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
     val arg = p.Arg()
     val roundMode = FpuRoundMode()
     val format = p.withDouble generate FpuFormat()
+    val rs1Boxed, rs2Boxed = p.withDouble generate Bool()
   }
 
 
@@ -79,6 +80,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
     val arg = Bits(2 bits)
     val roundMode = FpuRoundMode()
     val format = p.withDouble generate FpuFormat()
+    val rs1Boxed, rs2Boxed = p.withDouble generate Bool()
   }
 
   case class MulInput() extends Bundle{
@@ -198,7 +200,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
 
   //TODO nan boxing decoding
   val read = new Area{
-    val arbiter = StreamArbiterFactory.noLock.lowerFirst.build(FpuCmd(p), portCount)
+    val arbiter = StreamArbiterFactory.noLock.roundRobin.build(FpuCmd(p), portCount)
     arbiter.io.inputs <> Vec(io.port.map(_.cmd))
 
     val s0 = Stream(RfReadInput())
@@ -208,7 +210,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
 
     val useRs1, useRs2, useRs3, useRd = False
     switch(s0.opcode){
-      is(p.Opcode.LOAD)    {  useRd := True }
+      is(p.Opcode.LOAD)    { useRd := True }
       is(p.Opcode.STORE)   { useRs1 := True }
       is(p.Opcode.ADD)     { useRd  := True; useRs1 := True; useRs2 := True }
       is(p.Opcode.MUL)     { useRd  := True; useRs1 := True; useRs2 := True }
@@ -261,20 +263,25 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
     output.rs2 := rs2Entry.value
     output.rs3 := rs3Entry.value
     if(p.withDouble){
+      output.rs1Boxed := rs1Entry.boxed
+      output.rs2Boxed := rs2Entry.boxed
       output.format := s1.format
       val store = s1.opcode === FpuOpcode.STORE ||s1.opcode === FpuOpcode.FMV_X_W
-      when(store){ //Pass through
-        output.format := rs1Entry.boxed ? FpuFormat.FLOAT | FpuFormat.DOUBLE
-      } elsewhen(s1.format === FpuFormat.FLOAT =/= rs1Entry.boxed){
-        output.rs1.setNanQuiet
-        output.rs1.sign := False
-      }
-      when(s1.format === FpuFormat.FLOAT =/= rs2Entry.boxed){
-        output.rs2.setNanQuiet
-        output.rs2.sign := False
-      }
-      when(s1.format === FpuFormat.FLOAT =/= rs3Entry.boxed){
-        output.rs3.setNanQuiet
+      val sgnjBypass = s1.opcode === FpuOpcode.SGNJ && s1.format === FpuFormat.DOUBLE
+      when(!sgnjBypass) {
+        when(store) { //Pass through
+          output.format := rs1Entry.boxed ? FpuFormat.FLOAT | FpuFormat.DOUBLE
+        } elsewhen (s1.format === FpuFormat.FLOAT =/= rs1Entry.boxed) {
+          output.rs1.setNanQuiet
+          output.rs1.sign := False
+        }
+        when(s1.format === FpuFormat.FLOAT =/= rs2Entry.boxed) {
+          output.rs2.setNanQuiet
+          output.rs2.sign := False
+        }
+        when(s1.format === FpuFormat.FLOAT =/= rs3Entry.boxed) {
+          output.rs3.setNanQuiet
+        }
       }
     }
   }
@@ -686,8 +693,11 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
       )
       val result = (Mux(resign, ~unsigned, unsigned) + (resign ^ increment).asUInt)
       val overflow  = (input.rs1.exponent > (input.arg(0) ? U(exponentOne+30) | U(exponentOne+31)) || input.rs1.isInfinity) && !input.rs1.sign || input.rs1.isNan
-      val underflow = (input.rs1.exponent > U(exponentOne+31) || input.arg(0) && unsigned.msb && unsigned(30 downto 0) =/= 0 || !input.arg(0) && (unsigned =/= 0 || increment) || input.rs1.isInfinity) && input.rs1.sign
+      val underflow = (input.rs1.exponent > U(exponentOne+31) || input.arg(0) && unsigned.msb && (unsigned(30 downto 0) =/= 0 || increment) || !input.arg(0) && (unsigned =/= 0 || increment) || input.rs1.isInfinity) && input.rs1.sign
       val isZero = input.rs1.isZero
+      if(p.withDouble){
+        overflow setWhen(!input.rs1.sign && increment && unsigned(30 downto 0).andR && (input.arg(0) || unsigned(31)))
+      }
       when(isZero){
         result := 0
       } elsewhen(underflow || overflow) {
@@ -720,7 +730,13 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
     val minMaxSelectNanQuiet = input.rs1.isNan && input.rs2.isNan
     val cmpResult = B(rs1Smaller && !bothZero && !input.arg(1) || (rs1Equal || bothZero) && !input.arg(0))
     when(input.rs1.isNan || input.rs2.isNan) { cmpResult := 0 }
-    val sgnjResult = (input.rs1.sign && input.arg(1)) ^ input.rs2.sign ^ input.arg(0)
+    val sgnjRs1Sign = CombInit(input.rs1.sign)
+    val sgnjRs2Sign = CombInit(input.rs2.sign)
+    if(p.withDouble){
+      sgnjRs1Sign setWhen(input.rs1Boxed && input.format === FpuFormat.DOUBLE)
+      sgnjRs2Sign setWhen(input.rs2Boxed && input.format === FpuFormat.DOUBLE)
+    }
+    val sgnjResult = (sgnjRs1Sign && input.arg(1)) ^ sgnjRs2Sign ^ input.arg(0)
     val fclassResult = B(0, 32 bits)
     val decoded = input.rs1.decode()
     fclassResult(0) :=  input.rs1.sign &&  decoded.isInfinity
@@ -771,6 +787,22 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
       }
       is(FpuOpcode.SGNJ){
         rfOutput.value.sign := sgnjResult
+        if(p.withDouble) when(input.format === FpuFormat.DOUBLE){
+          when(input.rs1Boxed){
+            rfOutput.value.sign := input.rs1.sign
+            rfOutput.format := FpuFormat.FLOAT
+          }
+//          //kill boxing => F32 -> F64 NAN
+//          when(input.rs1Boxed && !sgnjResult){
+//            rfOutput.value.setNan
+//            rfOutput.value.mantissa.setAll()
+//            rfOutput.value.mantissa(31 downto 0) := input.rs1.sign ## input.rs1.exponent
+//          }
+//          //Spawn boxing => F64 NAN -> F32
+//          when(!input.rs1Boxed && input.rs1.exponent === exponentOne + 1024 && input.rs1.mantissa(32, 52-32 bits).andR && sgnjResult){
+//
+//          }
+        }
       }
       if(p.withDouble) is(FpuOpcode.FCVT_X_X){
         rfOutput.format := ((input.format === FpuFormat.FLOAT) ? FpuFormat.DOUBLE | FpuFormat.FLOAT)
diff --git a/src/main/scala/vexriscv/plugin/FpuPlugin.scala b/src/main/scala/vexriscv/plugin/FpuPlugin.scala
index 468f14b..855c397 100644
--- a/src/main/scala/vexriscv/plugin/FpuPlugin.scala
+++ b/src/main/scala/vexriscv/plugin/FpuPlugin.scala
@@ -17,6 +17,7 @@ class FpuPlugin(externalFpu : Boolean = false,
   object FPU_FORKED extends Stageable(Bool())
   object FPU_OPCODE extends Stageable(FpuOpcode())
   object FPU_ARG extends Stageable(Bits(2 bits))
+  object FPU_FORMAT extends Stageable(FpuFormat())
 
   var port : FpuPort = null
 
@@ -49,6 +50,7 @@ class FpuPlugin(externalFpu : Boolean = false,
     val fminMax = floatRfWrite :+ FPU_OPCODE -> FpuOpcode.MIN_MAX
     val fmvWx   = floatRfWrite :+ FPU_OPCODE -> FpuOpcode.FMV_W_X :+ RS1_USE -> True
     val fcvtI2f = floatRfWrite :+ FPU_OPCODE -> FpuOpcode.I2F     :+ RS1_USE -> True
+    val fcvtxx  = floatRfWrite :+ FPU_OPCODE -> FpuOpcode.FCVT_X_X
 
     val fcmp    = intRfWrite   :+ FPU_OPCODE -> FpuOpcode.CMP
     val fclass  = intRfWrite   :+ FPU_OPCODE -> FpuOpcode.FCLASS
@@ -73,35 +75,69 @@ class FpuPlugin(externalFpu : Boolean = false,
     def arg(v : Int) = FPU_ARG -> U(v, 2 bits)
     val decoderService = pipeline.service(classOf[DecoderService])
     decoderService.addDefault(FPU_ENABLE, False)
+
+    val f32 = FPU_FORMAT -> FpuFormat.FLOAT
+    val f64 = FPU_FORMAT -> FpuFormat.DOUBLE
+
     decoderService.add(List(
-      FADD_S    -> (addSub :+ arg(0)),
-      FSUB_S    -> (addSub :+ arg(1)),
-      FMADD_S   -> (fma :+ arg(0)),
-      FMSUB_S   -> (fma :+ arg(2)),
-      FNMADD_S  -> (fma :+ arg(3)),
-      FNMSUB_S  -> (fma :+ arg(1)),
-      FMUL_S    -> (mul :+ arg(0)),
-      FDIV_S    -> (div),
-      FSQRT_S   -> (sqrt),
-      FLW       -> (fl),
-      FSW       -> (fs),
-      FCVT_S_WU -> (fcvtI2f :+ arg(0)),
-      FCVT_S_W  -> (fcvtI2f :+ arg(1)),
-      FCVT_WU_S -> (fcvtF2i :+ arg(0)),
-      FCVT_W_S ->  (fcvtF2i :+ arg(1)),
-      FCLASS_S  -> (fclass),
-      FLE_S     -> (fcmp :+ arg(0)),
-      FEQ_S     -> (fcmp :+ arg(2)),
-      FLT_S     -> (fcmp :+ arg(1)),
-      FSGNJ_S   -> (fsgnj :+ arg(0)),
-      FSGNJN_S  -> (fsgnj :+ arg(1)),
-      FSGNJX_S  -> (fsgnj :+ arg(2)),
-      FMIN_S    -> (fminMax :+ arg(0)),
-      FMAX_S    -> (fminMax :+ arg(1)),
-      FMV_X_W   -> (fmvXw),
-      FMV_W_X   -> (fmvWx)
+      FADD_S    -> (addSub  :+ f32 :+ arg(0)),
+      FSUB_S    -> (addSub  :+ f32 :+ arg(1)),
+      FMADD_S   -> (fma     :+ f32 :+ arg(0)),
+      FMSUB_S   -> (fma     :+ f32 :+ arg(2)),
+      FNMADD_S  -> (fma     :+ f32 :+ arg(3)),
+      FNMSUB_S  -> (fma     :+ f32 :+ arg(1)),
+      FMUL_S    -> (mul     :+ f32 :+ arg(0)),
+      FDIV_S    -> (div     :+ f32 ),
+      FSQRT_S   -> (sqrt    :+ f32 ),
+      FLW       -> (fl      :+ f32 ),
+      FSW       -> (fs      :+ f32 ),
+      FCVT_S_WU -> (fcvtI2f :+ f32 :+ arg(0)),
+      FCVT_S_W  -> (fcvtI2f :+ f32 :+ arg(1)),
+      FCVT_WU_S -> (fcvtF2i :+ f32 :+ arg(0)),
+      FCVT_W_S ->  (fcvtF2i :+ f32 :+ arg(1)),
+      FCLASS_S  -> (fclass  :+ f32 ),
+      FLE_S     -> (fcmp    :+ f32 :+ arg(0)),
+      FEQ_S     -> (fcmp    :+ f32 :+ arg(2)),
+      FLT_S     -> (fcmp    :+ f32 :+ arg(1)),
+      FSGNJ_S   -> (fsgnj   :+ f32 :+ arg(0)),
+      FSGNJN_S  -> (fsgnj   :+ f32 :+ arg(1)),
+      FSGNJX_S  -> (fsgnj   :+ f32 :+ arg(2)),
+      FMIN_S    -> (fminMax :+ f32 :+ arg(0)),
+      FMAX_S    -> (fminMax :+ f32 :+ arg(1)),
+      FMV_X_W   -> (fmvXw   :+ f32 ),
+      FMV_W_X   -> (fmvWx   :+ f32 )
     ))
 
+    if(p.withDouble){ // Double-precision encodings, only decoded when the FPU is configured with withDouble
+      decoderService.add(List(
+        FADD_D    -> (addSub  :+ f64 :+ arg(0)),
+        FSUB_D    -> (addSub  :+ f64 :+ arg(1)),
+        FMADD_D   -> (fma     :+ f64 :+ arg(0)),
+        FMSUB_D   -> (fma     :+ f64 :+ arg(2)),
+        FNMADD_D  -> (fma     :+ f64 :+ arg(3)),
+        FNMSUB_D  -> (fma     :+ f64 :+ arg(1)),
+        FMUL_D    -> (mul     :+ f64 :+ arg(0)),
+        FDIV_D    -> (div     :+ f64 ),
+        FSQRT_D   -> (sqrt    :+ f64 ),
+        FLD       -> (fl      :+ f64 ), // 64-bit load; FLW stays mapped to f32 in the single-precision list
+        FSD       -> (fs      :+ f64 ), // 64-bit store; FSW stays mapped to f32 in the single-precision list
+        FCVT_D_WU -> (fcvtI2f :+ f64 :+ arg(0)), // unsigned int -> double
+        FCVT_D_W  -> (fcvtI2f :+ f64 :+ arg(1)), // signed int -> double
+        FCVT_WU_D -> (fcvtF2i :+ f64 :+ arg(0)),
+        FCVT_W_D  -> (fcvtF2i :+ f64 :+ arg(1)),
+        FCLASS_D  -> (fclass  :+ f64 ),
+        FLE_D     -> (fcmp    :+ f64 :+ arg(0)),
+        FEQ_D     -> (fcmp    :+ f64 :+ arg(2)),
+        FLT_D     -> (fcmp    :+ f64 :+ arg(1)),
+        FSGNJ_D   -> (fsgnj   :+ f64 :+ arg(0)),
+        FSGNJN_D  -> (fsgnj   :+ f64 :+ arg(1)),
+        FSGNJX_D  -> (fsgnj   :+ f64 :+ arg(2)),
+        FMIN_D    -> (fminMax :+ f64 :+ arg(0)),
+        FMAX_D    -> (fminMax :+ f64 :+ arg(1)),
+        FCVT_D_S  -> (fcvtxx :+ f32), // FPU_FORMAT is the source format here: single -> double
+        FCVT_S_D  -> (fcvtxx :+ f64)  // source is double -> single
+      ))
+    }
     //TODO FMV_X_X + doubles
 
     port = FpuPort(p)
@@ -178,7 +214,7 @@ class FpuPlugin(externalFpu : Boolean = false,
       port.cmd.rs2       := input(INSTRUCTION)(rs2Range).asUInt
       port.cmd.rs3       := input(INSTRUCTION)(rs3Range).asUInt
       port.cmd.rd        := input(INSTRUCTION)(rdRange).asUInt
-      port.cmd.format    := FpuFormat.FLOAT
+      port.cmd.format    := (if(p.withDouble) input(FPU_FORMAT) else FpuFormat.FLOAT())
       port.cmd.roundMode := roundMode.as(FpuRoundMode())
 
       insert(FPU_FORKED) := forked || port.cmd.fire
diff --git a/src/test/scala/vexriscv/ip/fpu/FpuTest.scala b/src/test/scala/vexriscv/ip/fpu/FpuTest.scala
index 2e88265..43265f8 100644
--- a/src/test/scala/vexriscv/ip/fpu/FpuTest.scala
+++ b/src/test/scala/vexriscv/ip/fpu/FpuTest.scala
@@ -55,7 +55,7 @@ class FpuTest extends FunSuite{
   }
 
   def testP(p : FpuParameter){
-    val portCount = 1
+    val portCount = 4
 
     val config = SimConfig
     config.allOptimisation
@@ -121,13 +121,13 @@ class FpuTest extends FunSuite{
           def f64_f64_i32 = {
             val str = next
             val s = new Scanner(str)
-            val a,b,c = (nextLong(s))
-            (b2d(a), b2d(b), c, s.nextInt(16))
+            val a,b = (nextLong(s))
+            (b2d(a), b2d(b), s.nextInt(16), s.nextInt(16))
           }
 
           def f64_f64 = {
             val s = new Scanner(next)
-            val a,b = (s.nextLong(16))
+            val a,b = nextLong(s)
             (b2d(a), b2d(b), s.nextInt(16))
           }
 
@@ -501,6 +501,16 @@ class FpuTest extends FunSuite{
 //          if(ref + Float.MinPositiveValue*2.0f  === dut || dut + Float.MinPositiveValue*2.0f  === ref)
           false
         }
+
+        def checkDouble(ref : Double, dut : Double): Boolean ={
+          // Loose f64 comparison: sign bits must match, then NaN/NaN, exact equality or magnitudes within ~0.02% are accepted.
+          val sameSign = (d2b(ref) & Long.MinValue) == (d2b(dut) & Long.MinValue)
+          val bothNaN = ref.isNaN && dut.isNaN
+          // One Double.MinPositiveValue of slack on both bounds, so tiny (subnormal-range) results are still really compared.
+          val upperOk = ref.abs * 1.0001 + Double.MinPositiveValue >= dut.abs*0.9999
+          val lowerOk = ref.abs * 0.9999 - Double.MinPositiveValue <= dut.abs*1.0001
+          sameSign && (bothNaN || ref == dut || (upperOk && lowerOk))
+        }
         def checkFloatExact(ref : Float, dut : Float): Boolean ={
           if(ref.signum != dut.signum === dut) return  false
           if(ref.isNaN && dut.isNaN) return true
@@ -514,6 +524,11 @@ class FpuTest extends FunSuite{
           (Random.nextDouble() * (Math.pow(2.0, exp)) * (if(Random.nextBoolean()) -1.0 else 1.0)).toFloat
         }
 
+        def randomDouble(): Double ={ // random sign, magnitude in [0, 2^e) with e drawn from -5..4
+          val magnitude = { val scale = Math.pow(2.0, Random.nextInt(10)-5); Random.nextDouble() * scale }
+          if(Random.nextBoolean()) -magnitude else magnitude
+        }
+
 
         def testBinaryOp(op : (Int,Int,Int,FpuRoundMode.E, FpuFormat.E) => Unit, a : Float, b : Float, ref : Float, flag : Int, rounding : FpuRoundMode.E, opName : String): Unit ={
           val rs = new RegAllocator()
@@ -538,7 +553,7 @@ class FpuTest extends FunSuite{
           load(rs2, b)
           op(rd,rs1,rs2, rounding, FpuFormat.DOUBLE)
           store(rd){v =>
-            assert(d2b(v) == d2b(ref), f"## ${a}  ${opName}  $b = $v, $ref $rounding")
+            assert(d2b(v) == d2b(ref), f"## ${a}  ${opName}  $b = $v, $ref $rounding, ${d2b(a).toString(16)} ${d2b(b).toString(16)} ${d2b(ref).toString(16)}")
           }
 
           flagMatch(flag, ref, f"## ${opName} ${a} $b $ref $rounding")
@@ -609,7 +624,7 @@ class FpuTest extends FunSuite{
           store(rd){v =>
             assert(d2b(v) == d2b(ref), f"testCvtF32F64Raw $a $ref $rounding")
           }
-          flagMatch(flag, f"testCvtF32F64Raw $a $ref $rounding")
+          flagMatch(flag,ref, f"testCvtF32F64Raw $a $ref $rounding")
         }
 
         def testCvtF64F32Raw(a : Double, ref : Float, flag : Int, rounding : FpuRoundMode.E): Unit ={
@@ -619,7 +634,7 @@ class FpuTest extends FunSuite{
           storeFloat(rd){v =>
             assert(d2b(v) == d2b(ref), f"testCvtF64F32Raw $a $ref $rounding")
           }
-          flagMatch(flag, f"testCvtF64F32Raw $a $ref $rounding")
+          flagMatch(flag, ref, f"testCvtF64F32Raw $a $ref $rounding")
         }
 
 
@@ -646,6 +661,30 @@ class FpuTest extends FunSuite{
         }
 
 
+        def testClassF64Raw(a : Double) : Unit = {
+          // Loads `a` into a random register, issues a double-precision fclass on it and compares the
+          // DUT value with the one-hot class mask derived from the raw sign/exponent/mantissa fields.
+          val rd = Random.nextInt(32)
+          load(rd, a)
+          fclass(rd, FpuFormat.DOUBLE){v =>
+            val bits = d2b(a)
+            val mantissa = bits & 0x000FFFFFFFFFFFFFl
+            val exp = (bits >> 52) & 0x7FF
+            val negative = ((bits >> 63) & 0x1) != 0
+            val mantissaMsbSet = (mantissa >> 51) != 0
+
+            val refBit = if(a.isInfinite) { if(negative) 0 else 7 }
+            else if(a.isNaN) { if(mantissaMsbSet) 9 else 8 }
+            else if(exp == 0 && mantissa != 0) { if(negative) 2 else 5 }
+            else if(exp == 0) { if(negative) 3 else 4 }
+            else if(negative) 1 else 6
+
+            assert(v == (1 << refBit), f"fclass $a")
+          }
+        }
+
+
+
         def testFmaRaw(a : Float, b : Float, c : Float): Unit ={
           val rs = new RegAllocator()
           val rs1, rs2, rs3 = rs.allocate()
@@ -663,6 +702,23 @@ class FpuTest extends FunSuite{
         }
 
 
+
+        def testFmaF64Raw(a : Double, b : Double, c : Double): Unit ={
+          // f64 fused multiply-add (RNE). The result is only checked when |a*b| exceeds |c| by more than 10% of |a*b|.
+          val regs = new RegAllocator()
+          val rs1, rs2, rs3 = regs.allocate()
+          val rd = Random.nextInt(32)
+          load(rs1, a)
+          load(rs2, b)
+          load(rs3, c)
+          fma(rd,rs1,rs2,rs3, FpuRoundMode.RNE, FpuFormat.DOUBLE)
+          store(rd){v =>
+            val mul = a * b
+            val ref = mul + c
+            if((mul.abs-c.abs)/mul.abs > 0.1)  assert(checkDouble(ref, v), f"$a%.20f * $b%.20f + $c%.20f = $v%.20f, $ref%.20f")
+          }
+        }
+
         def testSqrtExact(a : Float, ref : Float, flag : Int, rounding : FpuRoundMode.E): Unit ={
           val rs = new RegAllocator()
           val rs1, rs2, rs3 = rs.allocate()
@@ -690,6 +746,32 @@ class FpuTest extends FunSuite{
           }
         }
 
+        def testSqrtF64Exact(a : Double, ref : Double, flag : Int, rounding : FpuRoundMode.E): Unit ={
+          val rs = new RegAllocator()
+          val rs1, rs2, rs3 = rs.allocate()
+          val rd = Random.nextInt(32)
+          load(rs1, a)
+          // The sqrt is always issued with RNE; `rounding` only appears in the failure message and `flag` is not checked here.
+          sqrt(rd,rs1,  FpuRoundMode.RNE, FpuFormat.DOUBLE)
+          store(rd){v =>
+            val error = Math.abs(ref-v)/ref
+            assert(checkDouble(ref, v), f"sqrt($a) = $v, $ref $error $rounding")
+          }
+        }
+
+        def testDivF64Exact(a : Double, b : Double, ref : Double, flag : Int, rounding : FpuRoundMode.E): Unit ={
+          val rs = new RegAllocator()
+          val rs1, rs2, rs3 = rs.allocate()
+          val rd = Random.nextInt(32)
+          load(rs1, a)
+          load(rs2, b)
+          // The div is always issued with RNE; `rounding` only appears in the failure message and `flag` is not checked here.
+          div(rd,rs1, rs2, FpuRoundMode.RNE, FpuFormat.DOUBLE)
+          store(rd){v =>
+            val error = Math.abs(ref-v)/ref
+            assert(checkDouble(ref, v), f"div($a, $b) = $v, $ref $error $rounding")
+          }
+        }
 
 
         def testF2iExact(a : Float, ref : Int, flag : Int, signed : Boolean, rounding : FpuRoundMode.E): Unit ={
@@ -793,6 +875,23 @@ class FpuTest extends FunSuite{
         def testEqRaw(a : Float, b : Float, ref : Int, flag : Int) = testCmpExact(a,b,ref,flag, 2)
         def testLtRaw(a : Float, b : Float, ref : Int, flag : Int) = testCmpExact(a,b,ref,flag, 1)
 
+
+        def testCmpF64Exact(a : Double, b : Double, ref : Int, flag : Int, arg : Int): Unit ={ // f64 compare: load both operands, issue cmp with `arg`, check value then flags
+          val rs = new RegAllocator()
+          val rs1, rs2, rs3 = rs.allocate() // rs3 is allocated but not referenced below
+          val rd = Random.nextInt(32) // drawn but not referenced below
+          load(rs1, a)
+          load(rs2, b)
+          cmp(rs1, rs2, arg, FpuFormat.DOUBLE){rsp =>
+            val v = rsp.value.toBigInt.toInt // response value narrowed to Int for comparison with `ref`
+            assert(v === ref, f"cmp($a, $b, $arg) = $v, $ref")
+          }
+          flagMatch(flag,f"$a < $b $ref $flag ${d2b(a)} ${d2b(b)}") // message always prints "<", whatever `arg` is
+        }
+        def testLeF64Raw(a : Double, b : Double, ref : Int, flag : Int) = testCmpF64Exact(a,b,ref,flag, 0) // cmp arg 0: less-or-equal
+        def testEqF64Raw(a : Double, b : Double, ref : Int, flag : Int) = testCmpF64Exact(a,b,ref,flag, 2) // cmp arg 2: equal
+        def testLtF64Raw(a : Double, b : Double, ref : Int, flag : Int) = testCmpF64Exact(a,b,ref,flag, 1) // cmp arg 1: less-than
+
 //        def testFmv_x_w(a : Float): Unit ={
 //          val rs = new RegAllocator()
 //          val rs1, rs2, rs3 = rs.allocate()
@@ -849,6 +948,35 @@ class FpuTest extends FunSuite{
         def testMaxExact(a : Float, b : Float) : Unit = testMinMaxExact(a,b,1)
 
 
+        def testMinMaxF64Exact(a : Double, b : Double, arg : Int): Unit ={
+          // Double-precision minMax check: `arg` 0 selects min, any other value max. The DUT result is
+          // compared bit-for-bit with the reference, then flagMatch is called with the expected flag value.
+          val regs = new RegAllocator()
+          val rs1, rs2 = regs.allocate()
+          val rd = Random.nextInt(32)
+          // Reference: a fixed NaN pattern when both inputs are NaN, otherwise the non-NaN operand or the plain min/max.
+          val ref =
+            if(a.isNaN && b.isNaN) b2d(0x7ff8000000000000l)
+            else if(a.isNaN) b
+            else if(b.isNaN) a
+            else if(arg == 0) Math.min(a,b) else Math.max(a,b)
+          // Flag value 16 is expected when either operand is a NaN whose mantissa bit 51 is clear.
+          def nanWithBit51Clear(x : Double) = x.isNaN && ((d2b(x) >> 51 ) & 1) == 0
+          val flag = if(nanWithBit51Clear(a) || nanWithBit51Clear(b)) 16 else 0
+          load(rs1, a)
+          load(rs2, b)
+
+          minMax(rd,rs1,rs2, arg, FpuFormat.DOUBLE)
+          store(rd){v =>
+            assert(d2b(ref) ==  d2b(v), f"minMax($a $b $arg) = $v, $ref")
+          }
+          flagMatch(flag, f"minmax($a $b $arg)")
+        }
+
+        def testMinF64Exact(a : Double, b : Double) : Unit = testMinMaxF64Exact(a,b,0) // arg 0: reference uses Math.min
+        def testMaxF64Exact(a : Double, b : Double) : Unit = testMinMaxF64Exact(a,b,1) // arg 1: reference uses Math.max
+
+
         def testSgnjRaw(a : Float, b : Float): Unit ={
           val ref = b2f((f2b(a) & ~0x80000000) | f2b(b) & 0x80000000)
           testBinaryOp(sgnj,a,b,ref,0, null,"sgnj")
@@ -862,6 +990,23 @@ class FpuTest extends FunSuite{
           testBinaryOp(sgnjx,a,b,ref,0, null,"sgnjx")
         }
 
+        val f64SignMask = Long.MinValue // only bit 63 set
+        def testSgnjF64Raw(a : Double, b : Double): Unit ={
+          val aBits = d2b(a).toLong // when the upper 32 bits of `a` are all ones, `a` itself is the expected result
+          val ref = if(aBits >> 32 == -1) a else b2d((aBits & ~f64SignMask) | d2b(b).toLong & f64SignMask)
+          testBinaryOpF64(sgnj,a,b,ref,0, null,"sgnj")
+        }
+        def testSgnjnF64Raw(a : Double, b : Double): Unit ={
+          val aBits = d2b(a).toLong // same upper-32-bits-all-ones exception as the sgnj reference
+          val ref = if(aBits >> 32 == -1) a else b2d((aBits & ~f64SignMask) | ((d2b(b).toLong & f64SignMask) ^ f64SignMask))
+          testBinaryOpF64(sgnjn,a,b,ref,0, null,"sgnjn")
+        }
+        def testSgnjxF64Raw(a : Double, b : Double): Unit ={
+          val aBits = d2b(a).toLong // same upper-32-bits-all-ones exception as the sgnj reference
+          val ref = if(aBits >> 32 == -1) a else b2d(aBits ^ (d2b(b).toLong & f64SignMask))
+          testBinaryOpF64(sgnjx,a,b,ref,0, null,"sgnjx")
+        }
+
 
         def withMinus(that : Seq[Float]) = that.flatMap(f => List(f, -f))
         val fZeros = withMinus(List(0.0f))
@@ -887,25 +1032,46 @@ class FpuTest extends FunSuite{
           }
         }
 
-        def testFma() : Unit = {
+        def testFmaF32() : Unit = {
           testFmaRaw(randomFloat(), randomFloat(), randomFloat())
           flagClear()
         }
 
-        def testLe() : Unit = {
+
+        def testFmaF64() : Unit = { // one f64 fma check on three operands from randomDouble(), then flagClear()
+          testFmaF64Raw(randomDouble(), randomDouble(), randomDouble())
+          flagClear()
+        }
+
+        def testLeF32() : Unit = {
           val (a,b,i,f) = f32.le.RAW.f32_f32_i32
           testLeRaw(a,b,i, f)
         }
-        def testLt() : Unit = {
+        def testLtF32() : Unit = {
           val (a,b,i,f) = f32.lt.RAW.f32_f32_i32
           testLtRaw(a,b,i, f)
         }
 
-        def testEq() : Unit = {
+        def testEqF32() : Unit = {
           val (a,b,i,f) = f32.eq.RAW.f32_f32_i32
           testEqRaw(a,b,i, f)
         }
 
+        def testLeF64() : Unit = { // one f64 less-or-equal vector: operands, expected result, expected flags
+          val (lhs, rhs, expected, flags) = f64.le.RAW.f64_f64_i32
+          testLeF64Raw(lhs, rhs, expected, flags)
+        }
+        def testLtF64() : Unit = { // one f64 less-than vector
+          val (lhs, rhs, expected, flags) = f64.lt.RAW.f64_f64_i32
+          testLtF64Raw(lhs, rhs, expected, flags)
+        }
+
+        def testEqF64() : Unit = { // one f64 equality vector
+          val (lhs, rhs, expected, flags) = f64.eq.RAW.f64_f64_i32
+          testEqF64Raw(lhs, rhs, expected, flags)
+        }
+
+
         def testF2uiF32() : Unit = {
           val rounding = FpuRoundMode.elements.randomPick()
           val (a,b,f) = f32.f2ui(rounding).f32_i32
@@ -945,7 +1111,7 @@ class FpuTest extends FunSuite{
           flagClear()
         }
 
-        def testSgnj() : Unit = {
+        def testSgnjF32() : Unit = {
           testSgnjRaw(b2f(Random.nextInt()), b2f(Random.nextInt()))
           testSgnjnRaw(b2f(Random.nextInt()), b2f(Random.nextInt()))
           testSgnjxRaw(b2f(Random.nextInt()), b2f(Random.nextInt()))
@@ -955,6 +1121,31 @@ class FpuTest extends FunSuite{
           testSgnjxRaw(a, b)
         }
 
+        def testDivF64() : Unit = { // one f64 division vector taken from the set of a randomly picked rounding mode
+          val mode = FpuRoundMode.elements.randomPick()
+          val (num, den, quotient, flags) = f64.div(mode).f64_f64_f64
+          testDivF64Exact(num, den, quotient, flags, mode)
+          flagClear()
+        }
+
+        def testSqrtF64() : Unit = { // one f64 square-root vector taken from the set of a randomly picked rounding mode
+          val mode = FpuRoundMode.elements.randomPick()
+          val (operand, root, flags) = f64.sqrt(mode).f64_f64
+          testSqrtF64Exact(operand, root, flags, mode)
+          flagClear()
+        }
+
+        def testSgnjF64() : Unit = {
+          val variants = List[(Double, Double) => Unit](testSgnjF64Raw, testSgnjnF64Raw, testSgnjxF64Raw)
+          def randomBits() = b2d(Random.nextLong())
+          // First each variant on a fresh pair of random 64-bit patterns...
+          variants.foreach(op => op(randomBits(), randomBits()))
+          // ...then all three variants on one operand pair read from the sgnj vector set.
+          val (a, b, _, _) = f64.sgnj.RAW.f64_f64_i32
+          variants.foreach(op => op(a, b))
+        }
+
+
         def testTransferF32() : Unit = {
           val (a,b,r,f) = f32.transfer.RAW.f32_f32_i32
           testTransferF32Raw(a, Random.nextBoolean(), Random.nextBoolean())
@@ -985,20 +1176,35 @@ class FpuTest extends FunSuite{
           testCvtF64F32Raw(a, r, f, rounding)
         }
 
-        def testClass() : Unit = {
+        def testClassF32() : Unit = {
           val (a,b,r,f) = f32.fclass.RAW.f32_f32_i32
           testClassRaw(a)
         }
 
-        def testMin() : Unit = {
+        def testMinF32() : Unit = {
           val (a,b,r,f) = f32.min.RAW.f32_f32_f32
           testMinExact(a,b)
         }
-        def testMax() : Unit = {
+        def testMaxF32() : Unit = {
           val (a,b,r,f) = f32.max.RAW.f32_f32_f32
           testMaxExact(a,b)
         }
 
+        def testClassF64() : Unit = {
+          val (sample, _, _, _) = f64.fclass.RAW.f64_f64_i32 // only the first operand of the vector is classified
+          testClassF64Raw(sample)
+        }
+
+        def testMinF64() : Unit = { // operands come from the vector set; the expected value is recomputed by testMinF64Exact
+          val (lhs, rhs, _, _) = f64.min.RAW.f64_f64_f64
+          testMinF64Exact(lhs, rhs)
+        }
+        def testMaxF64() : Unit = { // operands come from the vector set; the expected value is recomputed by testMaxF64Exact
+          val (lhs, rhs, _, _) = f64.max.RAW.f64_f64_f64
+          testMaxF64Exact(lhs, rhs)
+        }
+
+
         def testUI2f32() : Unit = {
           val rounding = FpuRoundMode.elements.randomPick()
           val (a,b,f) = f32.i2f(rounding).i32_f32
@@ -1061,21 +1267,69 @@ class FpuTest extends FunSuite{
         }
 
 
-        val f32Tests = List[() => Unit](testSubF32, testAddF32, testMulF32, testI2f32, testUI2f32, testMin, testMax, testSgnj, testTransferF32, testDiv, testSqrt, testF2iF32, testF2uiF32, testLe, testEq, testLt, testClass, testFma)
+        val f32Tests = List[() => Unit](testSubF32, testAddF32, testMulF32, testI2f32, testUI2f32, testMinF32, testMaxF32, testSgnjF32, testTransferF32, testDiv, testSqrt, testF2iF32, testF2uiF32, testLeF32, testEqF32, testLtF32, testClassF32, testFmaF32)
+        val f64Tests = List[() => Unit](testSubF64, testAddF64, testMulF64, testI2f64, testUI2f64, testMinF64, testMaxF64, testSgnjF64, testTransferF64, testDivF64, testSqrtF64, testF2iF64, testF2uiF64, testLeF64, testEqF64, testLtF64, testClassF64, testFmaF64, testCvtF32F64, testCvtF64F32) // double-precision drivers only, including the F64 div/sqrt ones
 
+        // Pool used by the random mix: the f32 tests always, plus the f64 tests when the FPU is built with doubles.
+        val fxxTests = if(p.withDouble) f32Tests ++ f64Tests else f32Tests
+        
+        
         //TODO test boxing
         //TODO double <-> simple convertions
         if(p.withDouble) {
+          for(_ <- 0 until 10000) testCvtF64F32() // 1 did not equal 3 Flag missmatch dut=1 ref=3 testCvtF64F32Raw 1.1754942807573643E-38 1.17549435E-38 RMM
+          println("FCVT_D_S done")
           for(_ <- 0 until 10000) testCvtF32F64()
           println("FCVT_S_D done")
-          for(_ <- 0 until 10000) testCvtF64F32()
-          println("FCVT_D_S done")
+
+          for(_ <- 0 until 10000) testF2iF64()
+          println("f64 f2i done")
+          for(_ <- 0 until 10000) testF2uiF64()
+          println("f64 f2ui done")
+
+
+          for(_ <- 0 until 10000) testSgnjF64()
+          println("f64 sgnj done")
+
+
+
+          for(_ <- 0 until 10000) testMinF64()
+          for(_ <- 0 until 10000) testMaxF64()
+          println("f64 minMax done")
+
+
+
+          for(i <- 0 until 1000) testFmaF64()
+          flagClear()
+          println("f64 fma done") //TODO
+
+
+          for(_ <- 0 until 10000) testLeF64()
+          for(_ <- 0 until 10000) testLtF64()
+          for(_ <- 0 until 10000) testEqF64()
+          println("f64 Cmp done")
+
+
+          for(_ <- 0 until 10000) testDivF64()
+          println("f64 div done")
+
+          for(_ <- 0 until 10000) testSqrtF64()
+          println("f64 sqrt done")
+
+          for(_ <- 0 until 10000) testClassF64()
+          println("f64 class done")
+//
+
+
+
+
+
 
 
 
           for(_ <- 0 until 10000) testAddF64()
           for(_ <- 0 until 10000) testSubF64()
-          println("Add done")
+          println("f64 Add done")
 
 
           //          testI2f64Exact(0x7FFFFFF5, 0x7FFFFFF5, 0, true, FpuRoundMode.RNE)
@@ -1083,9 +1337,7 @@ class FpuTest extends FunSuite{
           for(_ <- 0 until 10000) testI2f64()
           println("f64 i2f done")
 
-          for(_ <- 0 until 10000) testF2uiF64()
-          for(_ <- 0 until 10000) testF2iF64()
-          println("f64 f2i done")
+
 
 //          testF2iExact(1.0f,1, 0, false, FpuRoundMode.RTZ)
 //          testF2iExact(2.0f,2, 0, false, FpuRoundMode.RTZ)
@@ -1156,7 +1408,7 @@ class FpuTest extends FunSuite{
 
 
 
-        for(i <- 0 until 1000) testFma()
+        for(i <- 0 until 1000) testFmaF32()
         flagClear()
         println("fma done") //TODO
 
@@ -1166,9 +1418,9 @@ class FpuTest extends FunSuite{
         testEqRaw(Float.PositiveInfinity,Float.PositiveInfinity,1, 0)
         testEqRaw(0f, 0f,1, 0)
 
-        for(_ <- 0 until 10000) testLe()
-        for(_ <- 0 until 10000) testLt()
-        for(_ <- 0 until 10000) testEq()
+        for(_ <- 0 until 10000) testLeF32()
+        for(_ <- 0 until 10000) testLtF32()
+        for(_ <- 0 until 10000) testEqF32()
         println("Cmp done")
 
 
@@ -1178,16 +1430,16 @@ class FpuTest extends FunSuite{
         for(_ <- 0 until 10000) testSqrt()
         println("f32 sqrt done")
 
-        for(_ <- 0 until 10000) testSgnj()
+        for(_ <- 0 until 10000) testSgnjF32()
         println("f32 sgnj done")
 
 
-        for(_ <- 0 until 10000) testClass()
+        for(_ <- 0 until 10000) testClassF32()
         println("f32 class done")
 
 
-        for(_ <- 0 until 10000) testMin()
-        for(_ <- 0 until 10000) testMax()
+        for(_ <- 0 until 10000) testMinF32()
+        for(_ <- 0 until 10000) testMaxF32()
         println("minMax done")
 
 
@@ -1229,11 +1481,13 @@ class FpuTest extends FunSuite{
 //        dut.clockDomain.waitSampling(1000)
 //        simSuccess()
 
-        for(i <- 0 until 1000) f32Tests.randomPick()()
+        for(i <- 0 until 10000) fxxTests.randomPick()()
         waitUntil(cpu.rspQueue.isEmpty)
       }
 
 
+
+
       stim.foreach(_.join())
       dut.clockDomain.waitSampling(100)
     }