From 81c193af1ff28021a4cb2979f17bce456b5f2be2 Mon Sep 17 00:00:00 2001
From: Dolu1990 <charles.papon.90@gmail.com>
Date: Fri, 26 Feb 2021 16:32:42 +0100
Subject: [PATCH] Improve subnormal/normal rounding

---
 src/main/scala/vexriscv/ip/fpu/FpuCore.scala |  74 ++++++---
 src/test/cpp/fpu/math/fpu_math.c             |  23 +++
 src/test/java/vexriscv/ip/fpu/FpuMath.java   |   3 +
 src/test/scala/vexriscv/ip/fpu/FpuTest.scala | 166 ++++++++++++-------
 4 files changed, 184 insertions(+), 82 deletions(-)

diff --git a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
index 6c73d83..76cbac0 100644
--- a/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
+++ b/src/main/scala/vexriscv/ip/fpu/FpuCore.scala
@@ -1004,7 +1004,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
       output.roundMode := input.roundMode
       output.scrap := norm.scrap
       output.value := norm.output
-      output.NV := NV //TODO isn't propagated in FMA
+      output.NV := NV
       output.DZ := False
 
       decode.mulToAdd.valid := input.valid && input.add
@@ -1019,6 +1019,10 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
       decode.mulToAdd.roundMode := input.roundMode
       if (p.withDouble) decode.mulToAdd.format := input.format
 
+      when(NV){ //NV is the invalid-operation flag that this stage also drives onto output.NV
+        decode.mulToAdd.rs1.mantissa.msb := False //NOTE(review): presumably clears the NaN quiet bit of the operand forwarded to the adder so the add half of the FMA reports NV too - confirm against the internal NaN encoding
+      }
+
       input.ready := (input.add ? decode.mulToAdd.ready | output.ready) || input.divSqrt
     }
   }
@@ -1558,7 +1562,23 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
     val ufThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal-52+1)(exponentF32Subnormal-23+1)
     val ofThreshold = muxDouble[UInt](input.format)(exponentF64Infinity-1)(exponentF32Infinity-1)
 
-    when(!math.special && math.exponent <= ufSubnormalThreshold && roundAdjusted.asUInt =/= 0){ //Do not catch exact 1.17549435E-38 underflow, but, who realy care ?
+    //Catch the exact 1.17549435E-38 underflow case, but who really cares?
+//    val borringCase = input.value.exponent === ufSubnormalThreshold && roundAdjusted.asUInt < U"11"
+//    when(!math.special && (math.exponent <= ufSubnormalThreshold || borringCase) && roundAdjusted.asUInt =/= 0){
+//      uf := True
+//    }
+    val threshold = input.roundMode.mux( //Upper bound (exclusive) that borringRound is compared against, selected by rounding mode
+      FpuRoundMode.RNE -> U"110",
+      FpuRoundMode.RTZ -> U"110",
+      FpuRoundMode.RDN -> (input.value.sign ? U"101" | U"111"), //Directed modes use a sign-dependent bound
+      FpuRoundMode.RUP -> (input.value.sign ? U"111" | U"101"), //Mirror image of RDN
+      FpuRoundMode.RMM -> U"110"
+    )
+    val borringRound = (input.value.mantissa(1 downto 0) ## input.scrap) //Two mantissa LSBs concatenated with the scrap bit
+    if(p.withDouble) when(input.format === FpuFormat.FLOAT) { borringRound := (input.value.mantissa(30 downto 29) ## input.value.mantissa(28 downto 0).orR)} //FLOAT format on a double-capable core: use mantissa bits 30..29 and the OR-reduction of bits 28..0 instead
+
+    val borringCase = input.value.exponent === ufSubnormalThreshold && borringRound.asUInt < threshold //Exponent exactly at ufSubnormalThreshold and the low bits below the per-mode bound
+    when(!math.special && (math.exponent <= ufSubnormalThreshold || borringCase) && roundAdjusted.asUInt =/= 0){ //uf also needs a non-special value and a non-zero roundAdjusted; note borringCase tests input.value.exponent while this line tests math.exponent
       uf := True
     }
     when(!math.special && math.exponent > ofThreshold){
@@ -1840,6 +1860,11 @@ cat all1.txt | grep "Errors found in"
 testfloat  -tininessafter -all2 > all2.txt
 cat all2.txt | grep "Errors found in"
 
+testfloat  -tininessafter -f32_mulAdd > fma.txt
+
+testfloat  -tininessafter -all2  -level 2 -checkall  > all2.txt
+
+
 
 all1 =>
 Errors found in f32_to_ui64_rx_minMag:
@@ -1848,28 +1873,29 @@ Errors found in f64_to_ui64_rx_minMag:
 Errors found in f64_to_i64_rx_minMag:
 
 all2 =>
-Errors found in f32_add, rounding near_even:
-Errors found in f32_add, rounding minMag:
-Errors found in f32_add, rounding min:
-Errors found in f32_add, rounding max:
-Errors found in f32_sub, rounding near_even:
-Errors found in f32_sub, rounding minMag:
-Errors found in f32_sub, rounding min:
-Errors found in f32_sub, rounding max:
-Errors found in f32_mul, rounding near_even:
-Errors found in f32_mul, rounding min:
-Errors found in f32_mul, rounding max:
-Errors found in f32_div, rounding near_even:
-Errors found in f32_div, rounding minMag:
-Errors found in f32_div, rounding min:
-Errors found in f32_div, rounding max:
-Errors found in f64_mul, rounding near_even:
-Errors found in f64_mul, rounding min:
-Errors found in f64_mul, rounding max:
-Errors found in f64_div, rounding near_even:
-Errors found in f64_div, rounding minMag:
-Errors found in f64_div, rounding min:
-Errors found in f64_div, rounding max:
+
+
+Errors found in f32_mulAdd, rounding min:
++00.7FFFFF  +67.000001  -01.000000
+        => -01.000000 ...ux  expected -01.000000 ....x
++67.000001  +00.7FFFFF  -01.000000
+        => -01.000000 ...ux  expected -01.000000 ....x
+-00.7FFFFF  -67.000001  -01.000000
+        => -01.000000 ...ux  expected -01.000000 ....x
+-67.000001  -00.7FFFFF  -01.000000
+        => -01.000000 ...ux  expected -01.000000 ....x
+Errors found in f32_mulAdd, rounding max:
++00.7FFFFF  -67.000001  +01.000000
+        => +01.000000 ...ux  expected +01.000000 ....x
++67.000001  -00.7FFFFF  +01.000000
+        => +01.000000 ...ux  expected +01.000000 ....x
++66.7FFFFE  -01.000001  +01.000000
+        => +01.000000 ...ux  expected +01.000000 ....x
+-00.7FFFFF  +67.000001  +01.000000
+        => +01.000000 ...ux  expected +01.000000 ....x
+-67.000001  +00.7FFFFF  +01.000000
+        => +01.000000 ...ux  expected +01.000000 ....x
+
 
 
  */
\ No newline at end of file
diff --git a/src/test/cpp/fpu/math/fpu_math.c b/src/test/cpp/fpu/math/fpu_math.c
index ee0b3c0..da45fe2 100644
--- a/src/test/cpp/fpu/math/fpu_math.c
+++ b/src/test/cpp/fpu/math/fpu_math.c
@@ -43,6 +43,10 @@ void applyRounding(int rounding){
 #define toF32(v) (*((float32_t*)&v))
 #define fromF32(x) (*((float*)&(x.v)))
 
+
+#define toF64(v) (*((float64_t*)&v)) // reinterpret the storage of v as a float64_t (pointer cast, no value conversion)
+#define fromF64(x) (*((double*)&(x.v))) // reinterpret the .v member of x as a native double
+
 JNIEXPORT jfloat API JNICALL Java_vexriscv_ip_fpu_FpuMath_addF32(JNIEnv * env, jobject obj, jfloat a, jfloat b, jint rounding){
     applyRounding(rounding);
     float32_t v = f32_add(toF32(a), toF32(b));
@@ -53,4 +57,23 @@ JNIEXPORT jfloat API JNICALL Java_vexriscv_ip_fpu_FpuMath_mulF32(JNIEnv * env, j
     applyRounding(rounding);
     float32_t v = f32_mul(toF32(a), toF32(b));
     return fromF32(v);
+}
+JNIEXPORT jint API JNICALL Java_vexriscv_ip_fpu_FpuMath_mulFlagF32(JNIEnv * env, jobject obj, jfloat a, jfloat b, jint rounding){ // JNI entry point: returns the exception flags raised by the float32 multiply a * b
+    applyRounding(rounding); // select the rounding mode requested by the caller
+    softfloat_exceptionFlags = 0; // start from a cleared flag word so only this operation is reported
+    float32_t v = f32_mul(toF32(a), toF32(b)); // the product itself is unused; the call is made for its effect on the flags
+    return softfloat_exceptionFlags;
+}
+
+
+JNIEXPORT jfloat API JNICALL Java_vexriscv_ip_fpu_FpuMath_d2f(JNIEnv * env, jobject obj, jdouble a,  jint rounding){ // JNI entry point: converts the double a to float32 with f64_to_f32
+    applyRounding(rounding); // select the rounding mode requested by the caller
+    float32_t v = f64_to_f32(toF64(a)); // toF64 reinterprets the jdouble storage as a float64_t
+    return fromF32(v); // hand the float32 bit pattern back as a native float
+}
+JNIEXPORT jint API JNICALL Java_vexriscv_ip_fpu_FpuMath_d2fFlag(JNIEnv * env, jobject obj, jdouble a, jint rounding){ // JNI entry point: returns the exception flags raised by converting the double a to float32
+    applyRounding(rounding); // select the rounding mode requested by the caller
+    softfloat_exceptionFlags = 0; // start from a cleared flag word so only this conversion is reported
+    float32_t v = f64_to_f32(toF64(a)); // the converted value is unused; the call is made for its effect on the flags
+    return softfloat_exceptionFlags;
 }
\ No newline at end of file
diff --git a/src/test/java/vexriscv/ip/fpu/FpuMath.java b/src/test/java/vexriscv/ip/fpu/FpuMath.java
index 88da007..51f9502 100644
--- a/src/test/java/vexriscv/ip/fpu/FpuMath.java
+++ b/src/test/java/vexriscv/ip/fpu/FpuMath.java
@@ -5,6 +5,9 @@ import java.io.File;
 public class FpuMath {
     public native float addF32(float a, float b, int rounding);
     public native float mulF32(float a, float b, int rounding);
+    public native int mulFlagF32(float a, float b, int rounding); // exception flags raised by the float32 multiply a * b (native side: Java_vexriscv_ip_fpu_FpuMath_mulFlagF32)
+    public native float d2f(double a, int rounding); // double to float conversion under the given rounding mode (native side: Java_vexriscv_ip_fpu_FpuMath_d2f)
+    public native int d2fFlag(double a, int rounding); // exception flags raised by that double to float conversion (native side: Java_vexriscv_ip_fpu_FpuMath_d2fFlag)
 
     static{
         System.load(new File("src/test/cpp/fpu/math/fpu_math.so").getAbsolutePath());
diff --git a/src/test/scala/vexriscv/ip/fpu/FpuTest.scala b/src/test/scala/vexriscv/ip/fpu/FpuTest.scala
index ddc53a8..6d4c1fe 100644
--- a/src/test/scala/vexriscv/ip/fpu/FpuTest.scala
+++ b/src/test/scala/vexriscv/ip/fpu/FpuTest.scala
@@ -59,7 +59,7 @@ class FpuTest extends FunSuite{
 
     val config = SimConfig
     config.allOptimisation
-    config.withFstWave
+//    config.withFstWave
     config.compile(new FpuCore(portCount, p){
       for(i <- 0 until portCount) out(Bits(5 bits)).setName(s"flagAcc$i") := io.port(i).completion.flags.asBits
       setDefinitionName("FpuCore"+ (if(p.withDouble) "Double" else  ""))
@@ -206,18 +206,18 @@ class FpuTest extends FunSuite{
 
         def softAssert(cond : Boolean, msg : String) = if(!cond)println(msg)
         def flagMatch(ref : Int, value : Float, report : String): Unit ={
-          val patch = if(value.abs == 1.17549435E-38f) ref & ~2 else ref
-          flagMatch(patch, report)
+          val patch = if(value.abs == 1.17549435E-38f && false) 0x1f & ~2 else 0x1f //The '&& false' disables the special case for this magnitude, so the mask is always 0x1f; without it bit 1 would be excluded from the comparison
+          flagMatch(ref, report, patch) //patch is passed as the comparison mask, ref is left untouched
         }
 
         def flagMatch(ref : Int, value : Double, report : String): Unit ={
-          val patch = if(value.abs == b2d(1 << 52)) ref & ~2 else ref
-          flagMatch(patch, report)
+          val patch = if(value.abs == b2d(1 << 52) && false) 0x1f & ~2 else 0x1f //The '&& false' disables the special case for this magnitude, so the mask is always 0x1f; without it bit 1 would be excluded from the comparison
+          flagMatch(ref, report, patch) //patch is passed as the comparison mask, ref is left untouched
         }
 
-        def flagMatch(ref : Int, report : String): Unit ={
+        def flagMatch(ref : Int, report : String, mask : Int = 0x1f): Unit ={ //mask selects which flag bits are compared; the default 0x1f keeps the five low bits
           waitUntil(pendingMiaou == 0)
-          assert(flagAccumulator == ref, s"Flag missmatch dut=$flagAccumulator ref=$ref $report")
+          assert((flagAccumulator & mask) == (ref & mask), s"Flag missmatch dut=$flagAccumulator ref=$ref $report") //Only the masked bits of the accumulated DUT flags and of the reference have to agree
           flagAccumulator = 0
         }
         def flagClear(): Unit ={
@@ -1257,26 +1257,68 @@ class FpuTest extends FunSuite{
         var fxxTests = f32Tests
         if(p.withDouble) fxxTests ++= f64Tests
 
+//5071920 5225560
+//        for(v <- List(-1.17549435082e-38f, 1.17549435082e-38f);
+//            rounding <- FpuRoundMode.elements) {
+//          for (i <- 0 until 2048) {
+//            val b = d2b(v)// 0x0010000000000000l //d2b(1.17549435082e-38)
+//            val s = (b - (i.toLong << 21)).toLong
+//            val d = b2d(s)
+////            val rounding = FpuRoundMode.RNE
+//            testCvtF64F32Raw(d, Clib.math.d2f(d, rounding.position), Clib.math.d2fFlag(d, rounding.position), rounding)
+//          }
+//        }
+//
+//
+//        testCvtF64F32Raw(-1.1754943508051483E-38, -1.17549435E-38f, 1, FpuRoundMode.RNE)
+//        testCvtF64F32Raw( 1.1754943157898258E-38, 1.17549435E-38f , 3, FpuRoundMode.RMM)
+//        testCvtF64F32Raw( 1.1754942807573643E-38, 1.17549435E-38f , 3, FpuRoundMode.RMM)
+//        testCvtF64F32Raw(-1.1754943508051483E-38, -1.17549435E-38f, 1, FpuRoundMode.RMM)
+
+        //-1.1754943508051483E-38 -1.17549435E-38 1 RNE @ 592770
+        // 1.1754943157898258E-38 1.17549435E-38 3 RMM  @ 2697440
+        // 1.1754942807573643E-38 1.17549435E-38 3 RMM
+//        for(_ <- 0 until 1000000) testCvtF64F32() // 1 did not equal 3 Flag missmatch dut=1 ref=3 testCvtF64F32Raw 1.1754942807573643E-38 1.17549435E-38 RMM
+//        println("FCVT_D_S done")
+
 //        testBinaryOpF64(div, -2.2250738564511294E-308, 4.294967296003891E9,  -5.180654E-318, 1, FpuRoundMode.RDN,"div") // ??? wtf
 
 //        testBinaryOp(add,b2f(0x7F800000),b2f(0x1FD << 23),b2f(0x7F800000),0, FpuRoundMode.RNE,"add")
 
 
-        for(_ <- 0 until 1000000) testDivF64()
-        println("f64 div done")
 
-
-        for(_ <- 0 until 10000) testDiv()
-        println("f32 div done")
-
-        for(_ <- 0 until 10000) testAddF32()
-        for(_ <- 0 until 10000) testSubF32()
-
-        println("Add done")
-
-
-        for(_ <- 0 until 10000) testSqrt()
-        println("f32 sqrt done")
+//        testBinaryOp(mul,1.1753509E-38f, 1.0001221f ,1.17549435E-38f,1, FpuRoundMode.RNE,"mul")
+//
+//        for(i <- 0 until 10000000){
+//          val rounding = FpuRoundMode.elements.randomPick()
+//          val (a,b,c,f) = f32.mul(rounding).f32_f32_f32
+//          testBinaryOp(mul,a,b,c,f, rounding,"mul")
+//        }
+//
+//        testBinaryOpF64(mul,2.781342323134002E-309, 7.999999999999999, 2.2250738585072014E-308, 3, FpuRoundMode.RNE,"mul")
+////        for(i <- 0 until 10000000){
+////          val rounding = FpuRoundMode.RNE
+////          val (a,b,c,f) = f64.mul(rounding).f64_f64_f64
+////          testBinaryOpF64(mul,a,b,c,f, rounding,"mul")
+////        }
+//        for(_ <- 0 until 100000000) testMulF64()
+//        println("f64 Mul done")
+//
+//        for(_ <- 0 until 10000) testDivF64()
+//        println("f64 div done")
+//
+//
+//        for(_ <- 0 until 10000) testDiv()
+//        println("f32 div done")
+//
+//        for(_ <- 0 until 10000) testAddF32()
+//        for(_ <- 0 until 10000) testSubF32()
+//
+//        println("Add done")
+//
+//
+//        for(_ <- 0 until 10000) testSqrt()
+//        println("f32 sqrt done")
 
 
 
@@ -1533,8 +1575,8 @@ class FpuTest extends FunSuite{
 //  DoCmd.doCmd(cmd)
 //  val math = new FpuMath
 //}
-//// cd /media/data/open/SaxonSoc/testFloatBuild/berkeley-softfloat-3/build/Linux-x86_64-GCC
-//// make clean && SPECIALIZE_TYPE=RISCV make -j$(nproc) && cp softfloat.a /media/data/open/SaxonSoc/artyA7SmpUpdate/SaxonSoc/ext/VexRiscv/src/test/cpp/fpu/math
+// cd /media/data/open/SaxonSoc/testFloatBuild/berkeley-softfloat-3/build/Linux-x86_64-GCC
+// make clean && SPECIALIZE_TYPE=RISCV make -j$(nproc) && cp softfloat.a /media/data/open/SaxonSoc/artyA7SmpUpdate/SaxonSoc/ext/VexRiscv/src/test/cpp/fpu/math
 //object FpuCompileSo extends App{
 //
 ////  val b2f = lang.Float.intBitsToFloat(_)
@@ -1548,29 +1590,52 @@ class FpuTest extends FunSuite{
 ////  miaou ffffffff 7fffffe0 7f
 ////  miaou 0 3ffffff0 70 = 0
 //
+//  val b2f = lang.Float.intBitsToFloat(_)
+//  val b2d = lang.Double.longBitsToDouble(_)
+//  val f2b = lang.Float.floatToRawIntBits(_)
+//  val d2bOffset = BigInt("10000000000000000",16)
+//  def d2b(that : Double) = {
+//    val l = lang.Double.doubleToRawLongBits(that)
+//    var a = BigInt(l)
+//    if(l < 0) {
+//      a = d2bOffset + a
+//    }
+//    a
+//  }
+//  val builder =new  StringBuilder()
+//  for(i <- 0 until 256){
+////    builder ++= (Clib.math.mulF32(1.17548538251e-38f, b2f(f2b(1.0f)+i),0)).toString + "\n"
+//    val b = d2b(1.17549435082e-38)
+//    val s = (b-(i.toLong << 25)).toLong
+//    val d = b2d(s)
+//    builder ++= f"$b $s $d => "
+//    builder ++= f"${d2b(d)}%x   " + (Clib.math.d2fFlag(d,0)).toString + " " + d + " => " + (Clib.math.d2f(d,FpuRoundMode.RMM.position)).toString + "\n"
+//  }
 //
+//  Thread.sleep(400)
+//  println(builder.toString)
 //    println(Clib.math.mulF32( 1.1753509E-38f, 1.0001221f, FpuRoundMode.RUP.position))
 //    println(Clib.math.mulF32( 1.1754945E-38f, 0.9999998f, FpuRoundMode.RUP.position))
-////  testBinaryOp(mul, 1.1753509E-38f, 1.0001221f, 1.17549435E-38f ,1, FpuRoundMode.RUP,"mul")
-////  testBinaryOp(mul, 1.1754945E-38f, 0.9999998f, 1.17549435E-38f, 3, FpuRoundMode.RUP, "mul")
-////  miaou ffffffff 7fffffe0 7f
-////  miaou 0 3ffffff0 70 = 0
-////  miaou ffffffff 7fffff7e 7f
-////  miaou 1 3fffffbf 3f = 1
-//
-////  println(Clib.math.mulF32( 1.1753509E-38f, 1.0001221f, FpuRoundMode.RUP.position))
-////  println(Clib.math.mulF32( 1.469368E-39f, 7.9999995f, FpuRoundMode.RUP.position))
-////  println(Clib.math.mulF32( 1.40129846432e-45f, 7.9999995f, FpuRoundMode.RUP.position))
-////  println(Clib.math.mulF32( 2.93873587706e-39f, 7.9999995f, FpuRoundMode.RUP.position))
-////  println(Clib.math.mulF32( 1f, 7.9999995f, FpuRoundMode.RUP.position))
-//
-//
-////  println(Clib.math.addF32(1.00000011921f, 4.0f, FpuRoundMode.RNE.position))
-////  println(Clib.math.addF32(1.00000011921f, 4.0f, FpuRoundMode.RTZ.position))
-////  println(Clib.math.addF32(1.00000011921f, 4.0f, FpuRoundMode.RDN.position))
-////  println(Clib.math.addF32(1.00000011921f, 4.0f, FpuRoundMode.RUP.position))
+//  testBinaryOp(mul, 1.1753509E-38f, 1.0001221f, 1.17549435E-38f ,1, FpuRoundMode.RUP,"mul")
+//  testBinaryOp(mul, 1.1754945E-38f, 0.9999998f, 1.17549435E-38f, 3, FpuRoundMode.RUP, "mul")
+//  miaou ffffffff 7fffffe0 7f
+//  miaou 0 3ffffff0 70 = 0
+//  miaou ffffffff 7fffff7e 7f
+//  miaou 1 3fffffbf 3f = 1
+
+//  println(Clib.math.mulF32( 1.1753509E-38f, 1.0001221f, FpuRoundMode.RUP.position))
+//  println(Clib.math.mulF32( 1.469368E-39f, 7.9999995f, FpuRoundMode.RUP.position))
+//  println(Clib.math.mulF32( 1.40129846432e-45f, 7.9999995f, FpuRoundMode.RUP.position))
+//  println(Clib.math.mulF32( 2.93873587706e-39f, 7.9999995f, FpuRoundMode.RUP.position))
+//  println(Clib.math.mulF32( 1f, 7.9999995f, FpuRoundMode.RUP.position))
+
+
+//  println(Clib.math.addF32(1.00000011921f, 4.0f, FpuRoundMode.RNE.position))
+//  println(Clib.math.addF32(1.00000011921f, 4.0f, FpuRoundMode.RTZ.position))
+//  println(Clib.math.addF32(1.00000011921f, 4.0f, FpuRoundMode.RDN.position))
+//  println(Clib.math.addF32(1.00000011921f, 4.0f, FpuRoundMode.RUP.position))
 //}
-//
+
 class ProcessStream(cmd : String){
   import sys.process._
 
@@ -1590,19 +1655,4 @@ class ProcessStream(cmd : String){
     buf.dequeue()()
   }
 }
-//
-//object TestSoftFloat extends App{
-//  val p = new ProcessStream("testfloat_gen -forever f32_add")
-//  Thread.sleep(1000)
-//  println(p.next)
-//  println(p.next)
-//  println(p.next)
-//  println(p.next)
-//  println(p.next)
-//  Thread.sleep(1000)
-//  println(p.next)
-//  while(true) {
-//    Thread.sleep(10)
-//    println(p.next)
-//  }
-//}
+