fpu added proper rounding for add (need to manage subtraction)

This commit is contained in:
Dolu1990 2021-01-28 00:25:16 +01:00
parent 195e4c422d
commit 1ae84ea83b
7 changed files with 252 additions and 52 deletions

View file

@ -30,6 +30,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
val rd = p.rfAddress()
val value = Bits(32 bits)
val arg = p.Arg()
val roundMode = FpuRoundMode()
}
case class RfReadOutput() extends Bundle{
@ -40,6 +41,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
val rd = p.rfAddress()
val value = Bits(32 bits)
val arg = p.Arg()
val roundMode = FpuRoundMode()
}
@ -49,6 +51,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
val lockId = lockIdType()
val i2f = Bool()
val arg = Bits(2 bits)
val roundMode = FpuRoundMode()
}
case class ShortPipInput() extends Bundle{
@ -61,6 +64,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
val value = Bits(32 bits)
val arg = Bits(2 bits)
def rs1 = rs1Raw.as(p.internalFloating)
val roundMode = FpuRoundMode()
}
case class MulInput() extends Bundle{
@ -71,6 +75,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
val add = Bool()
val divSqrt = Bool()
val msb1, msb2 = Bool() //allow usage of msb bits of mul
val roundMode = FpuRoundMode()
}
@ -80,6 +85,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
val rd = p.rfAddress()
val lockId = lockIdType()
val div = Bool()
val roundMode = FpuRoundMode()
}
@ -88,16 +94,26 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
val rs1, rs2 = p.internalFloating()
val rd = p.rfAddress()
val lockId = lockIdType()
val roundMode = FpuRoundMode()
}
case class WriteInput() extends Bundle{
// Payload carried by every stream that delivers a result toward the register file.
// The load, add, mul and shortPip units each emit a stream of MergeInput; those streams are
// arbitrated by `merge` and then handed to the `round` stage, which consumes `round`/`roundMode`.
case class MergeInput() extends Bundle{
val source = Source() // issuing port; compared against the port index for completion and used as the upper part of the register-file address
val lockId = lockIdType() // index of the rf.lock entry tracking this result
val rd = p.rfAddress() // destination register, lower part of the register-file address
val value = p.internalFloating() // result in the internal floating format, not yet rounded
val round = UInt(2 bits) // rounding information below the mantissa LSB; the add unit fills it with the two dropped mantissa bits, OR-ing the shifter's scrap flag into bit 0 (other producers currently drive 0)
val roundMode = FpuRoundMode() // selects the increment rule applied by the round stage
}
// Payload produced by the `round` stage and consumed by `writeback`.
// It carries the routing fields of the incoming payload unchanged, plus `value` after the
// rounding increment (and the overflow-to-infinity patch) has been applied.
case class RoundOutput() extends Bundle{
val source = Source() // issuing port; forms the upper part of the register-file write address
val lockId = lockIdType() // rf.lock entry released when the result is written back
val rd = p.rfAddress() // destination register, lower part of the write address
val value = p.internalFloating() // rounded result written to the register file
}
val rf = new Area{
val ram = Mem(p.internalFloating, 32*portCount)
val lock = for(i <- 0 until rfLockCount) yield new Area{
@ -222,6 +238,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
output.lockId := s1LockId
output.value := s1.value
output.arg := s1.arg
output.roundMode := s1.roundMode
output.rd := s1.rd
output.rs1 := rf.ram.readSync(s0.source @@ s0.rs1,enable = !output.isStall)
output.rs2 := rf.ram.readSync(s0.source @@ s0.rs2,enable = !output.isStall)
@ -298,6 +315,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
val value = p.storeLoadType()
val i2f = Bool()
val arg = Bits(2 bits)
val roundMode = FpuRoundMode()
}
val s0 = new Area{
@ -315,6 +333,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
output.value := feed.value
output.i2f := input.i2f
output.arg := input.arg
output.roundMode := input.roundMode
}
@ -406,17 +425,20 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
when(isInfinity){recoded.setInfinity}
when(isNan){recoded.setNan}
val output = input.haltWhen(busy).swapPayload(WriteInput())
val output = input.haltWhen(busy).swapPayload(MergeInput())
output.source := input.source
output.lockId := input.lockId
output.roundMode := input.roundMode
output.rd := input.rd
output.value := recoded
output.round := 0
when(input.i2f){
output.value.sign := i2fSign
output.value.exponent := (U(exponentOne+31) - fsm.manTop).resized
output.value.mantissa := U(i2fShifted)
output.value.setNormal
when(fsm.i2fZero) { output.value.setZero }
//TODO ROUND
}
}
}
@ -424,7 +446,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
val shortPip = new Area{
val input = decode.shortPip.stage()
val rfOutput = Stream(WriteInput())
val rfOutput = Stream(MergeInput())
val result = p.storeLoadType().assignDontCare()
@ -563,6 +585,8 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
rfOutput.source := input.source
rfOutput.lockId := input.lockId
rfOutput.rd := input.rd
rfOutput.roundMode := input.roundMode
rfOutput.round := 0 //TODO
rfOutput.value.assignDontCare()
switch(input.opcode){
is(FpuOpcode.MIN_MAX){
@ -634,11 +658,13 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
output.payload := math.mulC(p.internalMantissaSize, p.internalMantissaSize+1 bits)
}
val output = Stream(WriteInput())
val output = Stream(MergeInput())
output.valid := input.valid && !input.add && !input.divSqrt
output.source := input.source
output.lockId := input.lockId
output.rd := input.rd
output.roundMode := input.roundMode
output.round := 0 //TODO
output.value := norm.output
decode.mulToAdd.valid := input.valid && input.add
@ -650,6 +676,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
decode.mulToAdd.rs2 := input.rs3
decode.mulToAdd.rd := input.rd
decode.mulToAdd.lockId := input.lockId
decode.mulToAdd.roundMode := input.roundMode
input.ready := (input.add ? decode.mulToAdd.ready | output.ready) || input.divSqrt
}
@ -681,6 +708,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
decode.divSqrtToMul.msb2 := True
decode.divSqrtToMul.rs1.special := False //TODO
decode.divSqrtToMul.rs2.special := False
decode.divSqrtToMul.roundMode := input.roundMode
val aprox = new Area {
@ -845,7 +873,8 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
val rs1MantissaBigger = input.rs1.mantissa > input.rs2.mantissa
val absRs1Bigger = ((rs1ExponentBigger || rs1ExponentEqual && rs1MantissaBigger) && !input.rs1.isZero || input.rs1.isInfinity) && !input.rs2.isInfinity
val shiftBy = rs1ExponentBigger ? (0-exp21) | exp21
val passThrough = shiftBy >= p.internalMantissaSize || (input.rs1.isZero) || (input.rs2.isZero)
val shiftOverflow = shiftBy >= p.internalMantissaSize
val passThrough = shiftOverflow || (input.rs1.isZero) || (input.rs2.isZero)
//Note that rs1ExponentBigger can be replaced by absRs1Bigger bellow to avoid xsigned two complement in math block at expense of combinatorial path
val xySign = absRs1Bigger ? input.rs1.sign | input.rs2.sign
@ -853,7 +882,14 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
val ySign = xySign ^ (rs1ExponentBigger ? input.rs2.sign | input.rs1.sign)
val xMantissa = U"1" @@ (rs1ExponentBigger ? input.rs1.mantissa | input.rs2.mantissa) @@ U"0"
val yMantissaUnshifted = U"1" @@ (rs1ExponentBigger ? input.rs2.mantissa | input.rs1.mantissa) @@ U"0"
val yMantissa = yMantissaUnshifted >> (passThrough.asUInt @@ shiftBy.resize(log2Up(p.internalMantissaSize)))
var yMantissa = yMantissaUnshifted
val roundingScrap = CombInit(shiftOverflow)
for(i <- 0 until log2Up(p.internalMantissaSize)){
roundingScrap setWhen(shiftBy(i) && yMantissa(0, 1 << i bits) =/= 0)
yMantissa \= shiftBy(i) ? (yMantissa |>> (BigInt(1) << i)) | yMantissa
}
when(passThrough) { yMantissa := 0 }
// val yMantissa = yMantissaUnshifted >> (passThrough.asUInt @@ shiftBy.resize(log2Up(p.internalMantissaSize))) //Maybe passThrough.asUInt @@ do not infer small logic
val xyExponent = rs1ExponentBigger ? input.rs1.exponent | input.rs2.exponent
}
@ -866,9 +902,9 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
def xySign = shifter.xySign
val xSigned = xMantissa.twoComplement(xSign)
// val ySigned = (yMantissa +^ (yMantissa.lsb && !ySign).asUInt).twoComplement(ySign)
val ySigned = ((ySign ## Mux(ySign, ~yMantissa, yMantissa)).asUInt +^ (ySign || yMantissa.lsb).asUInt).asSInt //rounding here
val xyMantissa = U(xSigned + ySigned).trim(1 bits)
val ySigned = yMantissa.twoComplement(ySign)
// val ySigned = ((ySign ## Mux(ySign, ~yMantissa, yMantissa)).asUInt +^ (ySign || yMantissa.lsb).asUInt).asSInt //rounding here
val xyMantissa = U(xSigned +^ ySigned).trim(1 bits)
}
val norm = new Area{
@ -878,9 +914,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
val shiftOh = OHMasking.first(xyMantissa.asBools.reverse)
val shift = OHToUInt(shiftOh)
val mantissa = (xyMantissa |<< shift) >> 2
// val mantissaShifted = (xyMantissa |<< shift)
// val mantissa = ((xyMantissa ) >> 2) + U(xyMantissa(1))
val mantissa = (xyMantissa |<< shift)
val exponent = xyExponent -^ shift + 1
xySign clearWhen(input.rs1.isZero && input.rs2.isZero)
val forceZero = xyMantissa === 0 || exponent.msb || (input.rs1.isZero && input.rs2.isZero)
@ -889,14 +923,16 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
}
val output = input.swapPayload(WriteInput())
val output = input.swapPayload(MergeInput())
output.source := input.source
output.lockId := input.lockId
output.rd := input.rd
output.value.sign := norm.xySign
output.value.mantissa := norm.mantissa.resized
output.value.mantissa := (norm.mantissa >> 2).resized
output.value.exponent := norm.exponent.resized
output.value.special := False
output.roundMode := input.roundMode
output.round := norm.mantissa(1 downto 0) | (U"0" @@ shifter.roundingScrap)
when(norm.forceNan) {
output.value.setNanQuiet
@ -911,25 +947,59 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
}
val write = new Area{
val merge = new Area {
val arbitrated = StreamArbiterFactory.lowerFirst.noLock.on(List(load.s1.output, add.output, mul.output, shortPip.rfOutput))
val isCommited = rf.lock.map(_.commited).read(arbitrated.lockId)
val commited = arbitrated.haltWhen(!isCommited).toFlow
}
for(i <- 0 until portCount){
completion(i).increments += (RegNext(commited.fire && commited.source === i) init(False))
val round = new Area{
val input = merge.commited.combStage
val mantissaIncrement = !input.value.special && input.roundMode.mux(
FpuRoundMode.RNE -> (input.round(1) && (input.round(0) || input.value.mantissa.lsb)),
FpuRoundMode.RTZ -> False,
FpuRoundMode.RDN -> (input.round =/= 0 && input.value.sign),
FpuRoundMode.RUP -> (input.round =/= 0 && !input.value.sign),
FpuRoundMode.RMM -> (input.round(1))
)
val math = p.internalFloating()
val adder = (input.value.exponent @@ input.value.mantissa) + U(mantissaIncrement)
math.special := input.value.special
math.sign := input.value.sign
math.exponent := adder(p.internalMantissaSize, p.internalExponentSize bits)
math.mantissa := adder(0, p.internalMantissaSize bits)
val patched = CombInit(math)
when(!input.value.special && math.exponent === exponentOne + 128){
patched.setInfinity
}
when(commited.valid){
for(i <- 0 until rfLockCount) when(commited.lockId === i){
val output = input.swapPayload(RoundOutput())
output.source := input.source
output.lockId := input.lockId
output.rd := input.rd
output.value := patched
}
val writeback = new Area{
val input = round.output.combStage
for(i <- 0 until portCount){
completion(i).increments += (RegNext(input.fire && input.source === i) init(False))
}
when(input.valid){
for(i <- 0 until rfLockCount) when(input.lockId === i){
rf.lock(i).valid := False
}
}
val port = rf.ram.writePort
port.valid := commited.valid && rf.lock.map(_.write).read(commited.lockId)
port.address := commited.source @@ commited.rd
port.data := commited.value
port.valid := input.valid && rf.lock.map(_.write).read(input.lockId)
port.address := input.source @@ input.rd
port.data := input.value
when(port.valid){
assert(!(port.data.exponent === 0 && !port.data.special), "Special violation")

View file

@ -89,6 +89,21 @@ object FpuFormat extends SpinalEnum{
val FLOAT, DOUBLE = newElement()
}
/**
 * Rounding mode carried with each FPU command and applied by FpuCore's round stage.
 *
 * Elements are declared in a fixed order under the `binarySequential` encoding; the test bench
 * passes `position` of an element straight to the native reference model, so the declaration
 * order must not change.
 * The per-mode notes describe when the round stage increments the mantissa, given the two
 * rounding bits `round(1 downto 0)` of a non-special value.
 */
object FpuRoundMode extends SpinalEnum(defaultEncoding = binarySequential){
  /** Increment when round(1) is set and either round(0) or the mantissa LSB is set. */
  val RNE = newElement()
  /** Never increment. */
  val RTZ = newElement()
  /** Increment when any rounding bit is set and the value is negative. */
  val RDN = newElement()
  /** Increment when any rounding bit is set and the value is positive. */
  val RUP = newElement()
  /** Increment whenever round(1) is set. */
  val RMM = newElement()
}
// Rounding-mode selector with an explicit numeric encoding: the five concrete modes take
// codes 0..4 and DYN takes code 7 (codes 5 and 6 are left unassigned).
// NOTE(review): the name suggests this mirrors the instruction's rounding field, where FpuPlugin
// treats funct3 == 111 as "use csr.rm"; no use of this enum is visible here — confirm.
object FpuRoundModeInstr extends SpinalEnum(){
val RNE, RTZ, RDN, RUP, RMM, DYN = newElement()
// Custom encoding table, installed after the elements exist so they can be referenced by name.
defaultEncoding = SpinalEnumEncoding("opt")(
RNE -> 0,
RTZ -> 1,
RDN -> 2,
RUP -> 3,
RMM -> 4,
DYN -> 7
)
}
case class FpuParameter( internalMantissaSize : Int,
withDouble : Boolean){
@ -120,6 +135,7 @@ case class FpuCmd(p : FpuParameter) extends Bundle{
val rs1, rs2, rs3 = p.rfAddress()
val rd = p.rfAddress()
val format = p.Format()
val roundMode = FpuRoundMode()
}
case class FpuCommit(p : FpuParameter) extends Bundle{

View file

@ -166,14 +166,18 @@ class FpuPlugin(externalFpu : Boolean = false,
arbitration.haltItself setWhen(arbitration.isValid && input(FPU_ENABLE) && hazard)
arbitration.haltItself setWhen(port.cmd.isStall)
port.cmd.valid := arbitration.isValid && input(FPU_ENABLE) && !forked && !hazard
port.cmd.opcode := input(FPU_OPCODE)
port.cmd.arg := input(FPU_ARG)
port.cmd.rs1 := ((input(FPU_OPCODE) === FpuOpcode.STORE) ? input(INSTRUCTION)(rs2Range).asUInt | input(INSTRUCTION)(rs1Range).asUInt)
port.cmd.rs2 := input(INSTRUCTION)(rs2Range).asUInt
port.cmd.rs3 := input(INSTRUCTION)(rs3Range).asUInt
port.cmd.rd := input(INSTRUCTION)(rdRange).asUInt
port.cmd.format := FpuFormat.FLOAT
val iRoundMode = input(INSTRUCTION)(funct3Range)
val roundMode = (input(INSTRUCTION)(funct3Range) === B"111") ? csr.rm | input(INSTRUCTION)(funct3Range)
port.cmd.valid := arbitration.isValid && input(FPU_ENABLE) && !forked && !hazard
port.cmd.opcode := input(FPU_OPCODE)
port.cmd.arg := input(FPU_ARG)
port.cmd.rs1 := ((input(FPU_OPCODE) === FpuOpcode.STORE) ? input(INSTRUCTION)(rs2Range).asUInt | input(INSTRUCTION)(rs1Range).asUInt)
port.cmd.rs2 := input(INSTRUCTION)(rs2Range).asUInt
port.cmd.rs3 := input(INSTRUCTION)(rs3Range).asUInt
port.cmd.rd := input(INSTRUCTION)(rdRange).asUInt
port.cmd.format := FpuFormat.FLOAT
port.cmd.roundMode := roundMode.as(FpuRoundMode())
insert(FPU_FORKED) := forked || port.cmd.fire

View file

@ -0,0 +1,50 @@
#include <stdio.h>
#include <stdint.h>
#include <stdint.h>
#include <jni.h>
#include <softfloat.h>
extern void miaou();
//#include <fenv.h>
//#pragma STDC FENV_ACCESS ON
//int applyRounding(int rounding){
// int ret = fegetround( );
// switch(rounding){
// case 0: fesetround(FE_TONEAREST); break;
// case 1: fesetround(FE_TOWARDZERO); break;
// case 2: fesetround(FE_DOWNWARD); break;
// case 3: fesetround(FE_UPWARD); break;
// }
// return ret;
//}
// const int originalRounding = applyRounding(rounding);
// fesetround(originalRounding);
/*
 * Select the rounding mode used by subsequent SoftFloat operations.
 *
 * rounding: mode code in the range 0..4, stored verbatim into softfloat_roundingMode.
 *           Any other value is ignored and the previously selected mode stays in effect.
 */
void applyRounding(int rounding){
    if(rounding >= 0 && rounding <= 4){
        softfloat_roundingMode = rounding;
    }
}
#define API __attribute__((visibility("default")))
//float32_t toF32(float v){
// float32_t x;
// x.v = ;
// return x;
//}
#define toF32(v) (*((float32_t*)&v))
#define fromF32(x) (*((float*)&(x.v)))
/*
 * JNI implementation of vexriscv.ip.fpu.FpuMath.addF32.
 *
 * Adds a and b with SoftFloat's f32_add under the requested rounding mode and returns the
 * result as a Java float.
 *
 * env, obj: standard JNI arguments, unused.
 * a, b:     operands, reinterpreted bit-for-bit as SoftFloat float32_t values.
 * rounding: rounding-mode code forwarded to applyRounding().
 */
JNIEXPORT jfloat API JNICALL Java_vexriscv_ip_fpu_FpuMath_addF32(JNIEnv * env, jobject obj, jfloat a, jfloat b, jint rounding){
    /*
     * Reinterpret the bits through a union instead of casting a float pointer to a float32_t
     * pointer (and back): the pointer casts break C's strict-aliasing rules, whereas reading
     * the other member of a union is well defined.
     */
    union { jfloat f; float32_t s; } lhs, rhs, sum;
    (void)env;
    (void)obj;
    lhs.f = a;
    rhs.f = b;
    applyRounding(rounding);
    sum.s = f32_add(lhs.s, rhs.s);
    return sum.f;
}

View file

@ -0,0 +1,4 @@
/*
 * Symbol version script: export the FpuMath JNI entry points and hide everything else.
 * The leading wildcard is required because JNI symbols are prefixed with the mangled Java
 * package (e.g. Java_vexriscv_ip_fpu_FpuMath_addF32); a bare FpuMath_* pattern would not
 * match them and local: * would then hide the only entry point.
 * A version node must be terminated by a semicolon after its closing brace.
 */
CODEABI_1.0 {
global: *FpuMath_*;
local: *;
};

View file

@ -0,0 +1,11 @@
package vexriscv.ip.fpu;
import java.io.File;
/**
 * Java side of the native floating-point reference model.
 *
 * Loading this class loads the shared library from
 * {@code src/test/cpp/fpu/math/fpu_math.so}, resolved against the current working directory,
 * so that file must exist before the class is first used.
 */
public class FpuMath {
    static {
        final File library = new File("src/test/cpp/fpu/math/fpu_math.so");
        System.load(library.getAbsolutePath());
    }

    /**
     * Adds two single-precision values in native code.
     *
     * @param a        first operand
     * @param b        second operand
     * @param rounding rounding-mode code passed through to the native implementation
     * @return the sum computed by the native library
     */
    public native float addF32(float a, float b, int rounding);
}

View file

@ -1,12 +1,16 @@
package vexriscv.ip.fpu
import java.io.File
import java.lang
import org.apache.commons.io.FileUtils
import org.scalatest.FunSuite
import spinal.core.SpinalEnumElement
import spinal.core.sim._
import spinal.lib.DoCmd
import spinal.lib.experimental.math.Floating
import spinal.lib.sim._
import spinal.sim.Backend.{isMac, isWindows}
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
@ -113,7 +117,7 @@ class FpuTest extends FunSuite{
}
}
def add(rd : Int, rs1 : Int, rs2 : Int): Unit ={
def add(rd : Int, rs1 : Int, rs2 : Int, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
cmdQueue += {cmd =>
cmd.opcode #= cmd.opcode.spinalEnum.ADD
cmd.rs1 #= rs1
@ -121,6 +125,7 @@ class FpuTest extends FunSuite{
cmd.rs3.randomize()
cmd.rd #= rd
cmd.arg #= 0
cmd.roundMode #= rounding
}
commitQueue += {cmd =>
cmd.write #= true
@ -318,20 +323,21 @@ class FpuTest extends FunSuite{
(Random.nextDouble() * (Math.pow(2.0, exp)) * (if(Random.nextBoolean()) -1.0 else 1.0)).toFloat
}
def testAdd(a : Float, b : Float): Unit ={
def testAdd(a : Float, b : Float, rounding : FpuRoundMode.E = FpuRoundMode.RNE): Unit ={
val rs = new RegAllocator()
val rs1, rs2, rs3 = rs.allocate()
val rd = Random.nextInt(32)
load(rs1, a)
load(rs2, b)
add(rd,rs1,rs2)
add(rd,rs1,rs2, rounding)
storeFloat(rd){v =>
val a_ = clamp(a)
val b_ = clamp(b)
val ref = clamp(a_ + b_)
println(f"$a + $b = $v, $ref")
assert(checkFloat(ref, v))
val ref = Clib.math.addF32(a,b, rounding.position)
println(f"${a}%.19f + $b%.19f = $v, $ref $rounding")
println(f"${f2b(a).toHexString} + ${f2b(b).toHexString}")
assert(checkFloatExact(ref, v))
}
}
@ -547,6 +553,39 @@ class FpuTest extends FunSuite{
val iSigned = iSmall ++ iSmall.map(-_) ++ iBigSigned
val roundingModes = FpuRoundMode.elements
// Invokes `body` once for every entry of `roundingModes`, in that collection's order.
def foreachRounding(body : FpuRoundMode.E => Unit): Unit ={
  roundingModes.foreach(body)
}
//TODO test and fix a - b rounding
foreachRounding(testAdd(1.0f, b2f(0x3f800001), _)) //1.00001
foreachRounding(testAdd(4.0f, b2f(0x3f800001), _)) //1.00001
for(_ <- 0 until 10000; a = randomFloat(); b = randomFloat()) foreachRounding(testAdd(a.abs, b.abs,_)) //TODO negative
waitUntil(cmdQueue.isEmpty)
dut.clockDomain.waitSampling(1000)
simSuccess()
testAdd(b2f(0x3f800000), b2f(0x3f800000-1))
testAdd(1.1f, 2.3f)
testAdd(1.2f, -1.2f)
testAdd(-1.2f, 1.2f)
testAdd(0.0f, -1.2f)
testAdd(-0.0f, -1.2f)
testAdd(1.2f, -0f)
testAdd(1.2f, 0f)
testAdd(1.1f, Float.MinPositiveValue)
for(a <- fAll; _ <- 0 until 50) testAdd(a, randomFloat())
for(b <- fAll; _ <- 0 until 50) testAdd(randomFloat(), b)
for(a <- fAll; b <- fAll) testAdd(a, b)
for(_ <- 0 until 1000) testAdd(randomFloat(), randomFloat())
testLoadStore(1.17549435082e-38f)
testLoadStore(1.4E-45f)
testLoadStore(3.44383110592e-41f)
@ -573,21 +612,6 @@ class FpuTest extends FunSuite{
testAdd(b2f(0x3f800000), b2f(0x3f800000-1))
testAdd(1.1f, 2.3f)
testAdd(1.2f, -1.2f)
testAdd(-1.2f, 1.2f)
testAdd(0.0f, -1.2f)
testAdd(-0.0f, -1.2f)
testAdd(1.2f, -0f)
testAdd(1.2f, 0f)
testAdd(1.1f, Float.MinPositiveValue)
for(a <- fAll; _ <- 0 until 50) testAdd(a, randomFloat())
for(b <- fAll; _ <- 0 until 50) testAdd(randomFloat(), b)
for(a <- fAll; b <- fAll) testAdd(a, b)
for(_ <- 0 until 1000) testAdd(randomFloat(), randomFloat())
testLoadStore(1.2f)
@ -796,3 +820,24 @@ class FpuTest extends FunSuite{
}
}
}
/**
 * Builds and loads the native reference model used by the FPU tests.
 *
 * Initialising this object runs gcc to produce `src/test/cpp/fpu/math/fpu_math.so` from
 * `fpu_math.c` and `softfloat.a`, then instantiates [[FpuMath]], whose static initialiser loads
 * that shared library. All paths under `src/` are relative to the working directory.
 *
 * The Berkeley SoftFloat checkout is taken from the `SOFTFLOAT_HOME` environment variable when
 * it is set; otherwise the original hard-coded location is used, so existing setups keep working.
 */
object Clib {
  val java_home = System.getProperty("java.home")
  assert(java_home != "" && java_home != null, "JAVA_HOME need to be set")

  // JNI headers live in the JDK root, not in its nested jre directory.
  val jdk = java_home.replace("/jre","").replace("\\jre","")
  val jdkIncludes = jdk + "/include"
  val flags = List("-fPIC", "-m64", "-shared", "-Wno-attributes") //-Wl,--whole-archive

  // Root of the SoftFloat sources; overridable so the build is not tied to one machine's disk layout.
  val softfloatHome = sys.env.getOrElse("SOFTFLOAT_HOME", "/media/data/open/SaxonSoc/berkeley-softfloat-3")

  // Object files of a local SoftFloat build (currently not passed to gcc).
  // listFiles() returns null when the directory is missing; treat that as "no object files"
  // rather than failing object initialisation with a NullPointerException.
  val os = Option(new File(s"$softfloatHome/build/Linux-x86_64-GCC").listFiles())
    .getOrElse(Array.empty[File])
    .map(_.getAbsolutePath)
    .filter(_.endsWith(".o"))

  val cmd = s"gcc -I$softfloatHome/source/include -I$jdkIncludes -I$jdkIncludes/linux ${flags.mkString(" ")} -o src/test/cpp/fpu/math/fpu_math.so src/test/cpp/fpu/math/fpu_math.c src/test/cpp/fpu/math/softfloat.a"
  DoCmd.doCmd(cmd)

  // Must come after the compilation above: constructing FpuMath loads the freshly built library.
  val math = new FpuMath
}
/**
 * Manual smoke test: touching `Clib.math` builds and loads the native library, then the same
 * addition is printed once per rounding mode in the order RNE, RTZ, RDN, RUP.
 */
object FpuCompileSo extends App{
  private val modes = Seq(FpuRoundMode.RNE, FpuRoundMode.RTZ, FpuRoundMode.RDN, FpuRoundMode.RUP)
  for(mode <- modes){
    println(Clib.math.addF32(1.00000011921f, 4.0f, mode.position))
  }
}