fpu improve fmax
This commit is contained in:
parent
1e647f799c
commit
8537d18b16
|
@ -83,7 +83,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
|
||||||
val rs1Boxed, rs2Boxed = p.withDouble generate Bool()
|
val rs1Boxed, rs2Boxed = p.withDouble generate Bool()
|
||||||
}
|
}
|
||||||
|
|
||||||
case class MulInput() extends Bundle{
|
class MulInput() extends Bundle{
|
||||||
val source = Source()
|
val source = Source()
|
||||||
val rs1, rs2, rs3 = p.internalFloating()
|
val rs1, rs2, rs3 = p.internalFloating()
|
||||||
val rd = p.rfAddress()
|
val rd = p.rfAddress()
|
||||||
|
@ -117,7 +117,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
case class MergeInput() extends Bundle{
|
class MergeInput() extends Bundle{
|
||||||
val source = Source()
|
val source = Source()
|
||||||
val lockId = lockIdType()
|
val lockId = lockIdType()
|
||||||
val rd = p.rfAddress()
|
val rd = p.rfAddress()
|
||||||
|
@ -174,7 +174,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
|
||||||
val fork = new StreamFork(FpuCommit(p), 2)
|
val fork = new StreamFork(FpuCommit(p), 2)
|
||||||
fork.io.input << io.port(i).commit
|
fork.io.input << io.port(i).commit
|
||||||
fork.io.outputs(0) >> load(i)
|
fork.io.outputs(0) >> load(i)
|
||||||
fork.io.outputs(1) >> commit(i)
|
fork.io.outputs(1).pipelined(m2s = true, s2m = true) >> commit(i) //Pipelining here is light, as it only uses the flags of the payload
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -198,16 +198,16 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//TODO nan boxing decoding
|
|
||||||
val read = new Area{
|
val read = new Area{
|
||||||
val arbiter = StreamArbiterFactory.noLock.roundRobin.build(FpuCmd(p), portCount)
|
val arbiter = StreamArbiterFactory.noLock.roundRobin.build(FpuCmd(p), portCount)
|
||||||
arbiter.io.inputs <> Vec(io.port.map(_.cmd))
|
arbiter.io.inputs <> Vec(io.port.map(_.cmd))
|
||||||
|
|
||||||
val s0 = Stream(RfReadInput())
|
val arbiterOutput = Stream(RfReadInput())
|
||||||
s0.arbitrationFrom(arbiter.io.output)
|
arbiterOutput.arbitrationFrom(arbiter.io.output)
|
||||||
s0.source := arbiter.io.chosen
|
arbiterOutput.source := arbiter.io.chosen
|
||||||
s0.payload.assignSomeByName(arbiter.io.output.payload)
|
arbiterOutput.payload.assignSomeByName(arbiter.io.output.payload)
|
||||||
|
|
||||||
|
val s0 = arbiterOutput.pipelined(m2s = true, s2m = true)
|
||||||
val useRs1, useRs2, useRs3, useRd = False
|
val useRs1, useRs2, useRs3, useRd = False
|
||||||
switch(s0.opcode){
|
switch(s0.opcode){
|
||||||
is(p.Opcode.LOAD) { useRd := True }
|
is(p.Opcode.LOAD) { useRd := True }
|
||||||
|
@ -314,8 +314,8 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
|
||||||
|
|
||||||
val fmaHit = input.opcode === p.Opcode.FMA
|
val fmaHit = input.opcode === p.Opcode.FMA
|
||||||
val mulHit = input.opcode === p.Opcode.MUL || fmaHit
|
val mulHit = input.opcode === p.Opcode.MUL || fmaHit
|
||||||
val mul = Stream(MulInput())
|
val mul = Stream(new MulInput())
|
||||||
val divSqrtToMul = Stream(MulInput())
|
val divSqrtToMul = Stream(new MulInput())
|
||||||
|
|
||||||
if(p.withMul) {
|
if(p.withMul) {
|
||||||
input.ready setWhen (mulHit && mul.ready && !divSqrtToMul.valid)
|
input.ready setWhen (mulHit && mul.ready && !divSqrtToMul.valid)
|
||||||
|
@ -369,7 +369,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
|
||||||
}
|
}
|
||||||
|
|
||||||
val s0 = new Area{
|
val s0 = new Area{
|
||||||
val input = decode.load.stage()
|
val input = decode.load.pipelined(m2s = true, s2m = true)
|
||||||
val filtred = commitFork.load.map(port => port.takeWhen(port.sync))
|
val filtred = commitFork.load.map(port => port.takeWhen(port.sync))
|
||||||
def feed = filtred(input.source)
|
def feed = filtred(input.source)
|
||||||
val hazard = !feed.valid
|
val hazard = !feed.valid
|
||||||
|
@ -390,7 +390,6 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
|
||||||
output.format := FpuFormat.FLOAT
|
output.format := FpuFormat.FLOAT
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
val s1 = new Area{
|
val s1 = new Area{
|
||||||
|
@ -510,7 +509,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
|
||||||
when(isInfinity){recoded.setInfinity}
|
when(isInfinity){recoded.setInfinity}
|
||||||
when(isNan){recoded.setNan}
|
when(isNan){recoded.setNan}
|
||||||
|
|
||||||
val output = input.haltWhen(busy).swapPayload(MergeInput())
|
val output = input.haltWhen(busy).swapPayload(new MergeInput())
|
||||||
output.source := input.source
|
output.source := input.source
|
||||||
output.lockId := input.lockId
|
output.lockId := input.lockId
|
||||||
output.roundMode := input.roundMode
|
output.roundMode := input.roundMode
|
||||||
|
@ -540,7 +539,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
|
||||||
val shortPip = new Area{
|
val shortPip = new Area{
|
||||||
val input = decode.shortPip.stage()
|
val input = decode.shortPip.stage()
|
||||||
|
|
||||||
val rfOutput = Stream(MergeInput())
|
val rfOutput = Stream(new MergeInput())
|
||||||
|
|
||||||
val result = p.storeLoadType().assignDontCare()
|
val result = p.storeLoadType().assignDontCare()
|
||||||
|
|
||||||
|
@ -820,20 +819,61 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
|
||||||
}
|
}
|
||||||
|
|
||||||
val mul = p.withMul generate new Area{
|
val mul = p.withMul generate new Area{
|
||||||
val input = decode.mul.stage()
|
val inWidthA = p.internalMantissaSize+1
|
||||||
|
val inWidthB = p.internalMantissaSize+1
|
||||||
|
val outWidth = p.internalMantissaSize*2+2
|
||||||
|
|
||||||
val math = new Area {
|
case class MulSplit(offsetA : Int, offsetB : Int, widthA : Int, widthB : Int, id : Int){
|
||||||
|
val offsetC = offsetA+offsetB
|
||||||
|
}
|
||||||
|
val splitsUnordered = for(offsetA <- 0 until inWidthA by p.mulWidthA;
|
||||||
|
offsetB <- 0 until inWidthB by p.mulWidthB;
|
||||||
|
widthA = (inWidthA - offsetA) min p.mulWidthA;
|
||||||
|
widthB = (inWidthB - offsetB) min p.mulWidthB) yield {
|
||||||
|
MulSplit(offsetA, offsetB, widthA, widthB, -1)
|
||||||
|
}
|
||||||
|
val splits = splitsUnordered.sortWith(_.offsetC < _.offsetC).zipWithIndex.map(e => e._1.copy(id=e._2))
|
||||||
|
|
||||||
|
class MathWithExp extends MulInput{
|
||||||
|
val exp = UInt(p.internalExponentSize+1 bits)
|
||||||
|
}
|
||||||
|
val preMul = new Area{
|
||||||
|
val input = decode.mul.stage()
|
||||||
|
val output = input.swapPayload(new MathWithExp())
|
||||||
|
output.payload.assignSomeByName(input.payload)
|
||||||
|
output.exp := input.rs1.exponent +^ input.rs2.exponent
|
||||||
|
}
|
||||||
|
class MathWithMul extends MathWithExp{
|
||||||
|
val muls = Vec(splits.map(e => UInt(e.widthA + e.widthB bits)))
|
||||||
|
}
|
||||||
|
val mul = new Area{
|
||||||
|
val input = preMul.output.stage()
|
||||||
|
val output = input.swapPayload(new MathWithMul())
|
||||||
val mulA = U(input.msb1) @@ input.rs1.mantissa
|
val mulA = U(input.msb1) @@ input.rs1.mantissa
|
||||||
val mulB = U(input.msb2) @@ input.rs2.mantissa
|
val mulB = U(input.msb2) @@ input.rs2.mantissa
|
||||||
val mulC = mulA * mulB
|
output.payload.assignSomeByName(input.payload)
|
||||||
val exp = input.rs1.exponent +^ input.rs2.exponent
|
splits.foreach(e => output.muls(e.id) := mulA(e.offsetA, e.widthA bits) * mulB(e.offsetB, e.widthB bits))
|
||||||
|
}
|
||||||
|
|
||||||
|
class MathOutput extends MathWithExp{
|
||||||
|
val mulC = UInt(p.internalMantissaSize*2+2 bits)
|
||||||
|
}
|
||||||
|
|
||||||
|
val math = new Area {
|
||||||
|
val input = mul.output.stage()
|
||||||
|
val sum = splits.map(e => (input.muls(e.id) << e.offsetC).resize(outWidth)).reduceBalancedTree(_ + _)
|
||||||
|
|
||||||
|
val output = input.swapPayload(new MathOutput())
|
||||||
|
output.payload.assignSomeByName(input.payload)
|
||||||
|
output.mulC := sum
|
||||||
}
|
}
|
||||||
|
|
||||||
val norm = new Area{
|
val norm = new Area{
|
||||||
val (mulHigh, mulLow) = math.mulC.splitAt(p.internalMantissaSize-1)
|
val input = math.output.stage()
|
||||||
|
val (mulHigh, mulLow) = input.mulC.splitAt(p.internalMantissaSize-1)
|
||||||
val scrap = mulLow =/= 0
|
val scrap = mulLow =/= 0
|
||||||
val needShift = mulHigh.msb
|
val needShift = mulHigh.msb
|
||||||
val exp = math.exp + U(needShift)
|
val exp = input.exp + U(needShift)
|
||||||
val man = needShift ? mulHigh(1, p.internalMantissaSize+1 bits) | mulHigh(0, p.internalMantissaSize+1 bits)
|
val man = needShift ? mulHigh(1, p.internalMantissaSize+1 bits) | mulHigh(0, p.internalMantissaSize+1 bits)
|
||||||
scrap setWhen(needShift && mulHigh(0))
|
scrap setWhen(needShift && mulHigh(0))
|
||||||
val forceZero = input.rs1.isZero || input.rs2.isZero
|
val forceZero = input.rs1.isZero || input.rs2.isZero
|
||||||
|
@ -863,38 +903,40 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
|
||||||
} elsewhen(forceUnderflow) {
|
} elsewhen(forceUnderflow) {
|
||||||
output.exponent := underflowExp.resized
|
output.exponent := underflowExp.resized
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
val notMul = new Area{
|
val result = new Area {
|
||||||
val output = Flow(UInt(p.internalMantissaSize + 1 bits))
|
def input = norm.input
|
||||||
output.valid := input.valid && input.divSqrt
|
val notMul = new Area {
|
||||||
output.payload := math.mulC(p.internalMantissaSize, p.internalMantissaSize+1 bits)
|
val output = Flow(UInt(p.internalMantissaSize + 1 bits))
|
||||||
|
output.valid := input.valid && input.divSqrt
|
||||||
|
output.payload := input.mulC(p.internalMantissaSize, p.internalMantissaSize + 1 bits)
|
||||||
|
}
|
||||||
|
|
||||||
|
val output = Stream(new MergeInput())
|
||||||
|
output.valid := input.valid && !input.add && !input.divSqrt
|
||||||
|
output.source := input.source
|
||||||
|
output.lockId := input.lockId
|
||||||
|
output.rd := input.rd
|
||||||
|
if (p.withDouble) output.format := input.format
|
||||||
|
output.roundMode := input.roundMode
|
||||||
|
output.scrap := norm.scrap
|
||||||
|
output.value := norm.output
|
||||||
|
|
||||||
|
decode.mulToAdd.valid := input.valid && input.add
|
||||||
|
decode.mulToAdd.source := input.source
|
||||||
|
decode.mulToAdd.rs1.mantissa := norm.output.mantissa >> 1 //FMA Precision lost
|
||||||
|
decode.mulToAdd.rs1.exponent := norm.output.exponent
|
||||||
|
decode.mulToAdd.rs1.sign := norm.output.sign
|
||||||
|
decode.mulToAdd.rs1.special := False //TODO
|
||||||
|
decode.mulToAdd.rs2 := input.rs3
|
||||||
|
decode.mulToAdd.rd := input.rd
|
||||||
|
decode.mulToAdd.lockId := input.lockId
|
||||||
|
decode.mulToAdd.roundMode := input.roundMode
|
||||||
|
if (p.withDouble) decode.mulToAdd.format := input.format
|
||||||
|
|
||||||
|
input.ready := (input.add ? decode.mulToAdd.ready | output.ready) || input.divSqrt
|
||||||
}
|
}
|
||||||
|
|
||||||
val output = Stream(MergeInput())
|
|
||||||
output.valid := input.valid && !input.add && !input.divSqrt
|
|
||||||
output.source := input.source
|
|
||||||
output.lockId := input.lockId
|
|
||||||
output.rd := input.rd
|
|
||||||
if(p.withDouble) output.format := input.format
|
|
||||||
output.roundMode := input.roundMode
|
|
||||||
output.scrap := norm.scrap
|
|
||||||
output.value := norm.output
|
|
||||||
|
|
||||||
decode.mulToAdd.valid := input.valid && input.add
|
|
||||||
decode.mulToAdd.source := input.source
|
|
||||||
decode.mulToAdd.rs1.mantissa := norm.output.mantissa >> 1 //FMA Precision lost
|
|
||||||
decode.mulToAdd.rs1.exponent := norm.output.exponent
|
|
||||||
decode.mulToAdd.rs1.sign := norm.output.sign
|
|
||||||
decode.mulToAdd.rs1.special := False //TODO
|
|
||||||
decode.mulToAdd.rs2 := input.rs3
|
|
||||||
decode.mulToAdd.rd := input.rd
|
|
||||||
decode.mulToAdd.lockId := input.lockId
|
|
||||||
decode.mulToAdd.roundMode := input.roundMode
|
|
||||||
if(p.withDouble) decode.mulToAdd.format := input.format
|
|
||||||
|
|
||||||
input.ready := (input.add ? decode.mulToAdd.ready | output.ready) || input.divSqrt
|
|
||||||
}
|
}
|
||||||
|
|
||||||
val divSqrt = p.withDivSqrt generate new Area {
|
val divSqrt = p.withDivSqrt generate new Area {
|
||||||
|
@ -965,7 +1007,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
|
||||||
decode.divSqrtToMul.msb2 := rs2.msb
|
decode.divSqrtToMul.msb2 := rs2.msb
|
||||||
}
|
}
|
||||||
|
|
||||||
val mulBuffer = mul.notMul.output.toStream.stage
|
val mulBuffer = mul.result.notMul.output.toStream.stage
|
||||||
mulBuffer.ready := False
|
mulBuffer.ready := False
|
||||||
|
|
||||||
val iterationValue = Reg(UInt(mulWidth bits))
|
val iterationValue = Reg(UInt(mulWidth bits))
|
||||||
|
@ -1081,9 +1123,20 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
|
||||||
}
|
}
|
||||||
|
|
||||||
val add = p.withAdd generate new Area{
|
val add = p.withAdd generate new Area{
|
||||||
val input = decode.add.stage()
|
|
||||||
|
class ShifterOutput extends AddInput{
|
||||||
|
val xSign, ySign = Bool()
|
||||||
|
val xMantissa, yMantissa = UInt(p.internalMantissaSize+3 bits)
|
||||||
|
val xyExponent = UInt(p.internalExponentSize bits)
|
||||||
|
val xySign = Bool()
|
||||||
|
val roundingScrap = Bool()
|
||||||
|
}
|
||||||
|
|
||||||
val shifter = new Area {
|
val shifter = new Area {
|
||||||
|
val input = decode.add.stage()
|
||||||
|
val output = input.swapPayload(new ShifterOutput)
|
||||||
|
output.payload.assignSomeByName(input.payload)
|
||||||
|
|
||||||
val exp21 = input.rs2.exponent -^ input.rs1.exponent
|
val exp21 = input.rs2.exponent -^ input.rs1.exponent
|
||||||
val rs1ExponentBigger = (exp21.msb || input.rs2.isZero) && !input.rs1.isZero
|
val rs1ExponentBigger = (exp21.msb || input.rs2.isZero) && !input.rs1.isZero
|
||||||
val rs1ExponentEqual = input.rs1.exponent === input.rs2.exponent
|
val rs1ExponentEqual = input.rs1.exponent === input.rs2.exponent
|
||||||
|
@ -1095,8 +1148,8 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
|
||||||
|
|
||||||
//Note that rs1ExponentBigger can be replaced by absRs1Bigger below to avoid the xSigned two's complement in the math block, at the expense of the combinatorial path
|
//Note that rs1ExponentBigger can be replaced by absRs1Bigger below to avoid the xSigned two's complement in the math block, at the expense of the combinatorial path
|
||||||
val xySign = absRs1Bigger ? input.rs1.sign | input.rs2.sign
|
val xySign = absRs1Bigger ? input.rs1.sign | input.rs2.sign
|
||||||
val xSign = xySign ^ (rs1ExponentBigger ? input.rs1.sign | input.rs2.sign)
|
output.xSign := xySign ^ (rs1ExponentBigger ? input.rs1.sign | input.rs2.sign)
|
||||||
val ySign = xySign ^ (rs1ExponentBigger ? input.rs2.sign | input.rs1.sign)
|
output.ySign := xySign ^ (rs1ExponentBigger ? input.rs2.sign | input.rs1.sign)
|
||||||
val xMantissa = U"1" @@ (rs1ExponentBigger ? input.rs1.mantissa | input.rs2.mantissa) @@ U"00"
|
val xMantissa = U"1" @@ (rs1ExponentBigger ? input.rs1.mantissa | input.rs2.mantissa) @@ U"00"
|
||||||
val yMantissaUnshifted = U"1" @@ (rs1ExponentBigger ? input.rs2.mantissa | input.rs1.mantissa) @@ U"00"
|
val yMantissaUnshifted = U"1" @@ (rs1ExponentBigger ? input.rs2.mantissa | input.rs1.mantissa) @@ U"00"
|
||||||
var yMantissa = CombInit(yMantissaUnshifted)
|
var yMantissa = CombInit(yMantissaUnshifted)
|
||||||
|
@ -1108,66 +1161,86 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
|
||||||
when(passThrough) { yMantissa := 0 }
|
when(passThrough) { yMantissa := 0 }
|
||||||
when(shiftOverflow) { roundingScrap := True }
|
when(shiftOverflow) { roundingScrap := True }
|
||||||
when(input.rs1.special || input.rs2.special){ roundingScrap := False }
|
when(input.rs1.special || input.rs2.special){ roundingScrap := False }
|
||||||
val xyExponent = rs1ExponentBigger ? input.rs1.exponent | input.rs2.exponent
|
output.xyExponent := rs1ExponentBigger ? input.rs1.exponent | input.rs2.exponent
|
||||||
|
output.xMantissa := xMantissa
|
||||||
|
output.yMantissa := yMantissa
|
||||||
|
output.xySign := xySign
|
||||||
|
output.roundingScrap := roundingScrap
|
||||||
|
}
|
||||||
|
|
||||||
|
class MathOutput extends ShifterOutput{
|
||||||
|
val xyMantissa = UInt(p.internalMantissaSize+4 bits)
|
||||||
}
|
}
|
||||||
|
|
||||||
val math = new Area {
|
val math = new Area {
|
||||||
def xSign = shifter.xSign
|
val input = shifter.output.stage()
|
||||||
def ySign = shifter.ySign
|
val output = input.swapPayload(new MathOutput)
|
||||||
def xMantissa = shifter.xMantissa
|
output.payload.assignSomeByName(input.payload)
|
||||||
def yMantissa = shifter.yMantissa
|
import input.payload._
|
||||||
def xyExponent = shifter.xyExponent
|
|
||||||
def xySign = shifter.xySign
|
|
||||||
|
|
||||||
val xSigned = xMantissa.twoComplement(xSign) //TODO Is that necessary ?
|
val xSigned = xMantissa.twoComplement(xSign) //TODO Is that necessary ?
|
||||||
val ySigned = ((ySign ## Mux(ySign, ~yMantissa, yMantissa)).asUInt + (ySign && !shifter.roundingScrap).asUInt).asSInt //rounding here
|
val ySigned = ((ySign ## Mux(ySign, ~yMantissa, yMantissa)).asUInt + (ySign && !roundingScrap).asUInt).asSInt //rounding here
|
||||||
val xyMantissa = U(xSigned +^ ySigned).trim(1 bits)
|
output.xyMantissa := U(xSigned +^ ySigned).trim(1 bits)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class NormOutput extends AddInput{
|
||||||
|
val mantissa = UInt(p.internalMantissaSize+4 bits)
|
||||||
|
val exponent = UInt(p.internalExponentSize+1 bits)
|
||||||
|
val infinityNan, forceNan, forceZero, forceInfinity = Bool()
|
||||||
|
val xySign, roundingScrap = Bool()
|
||||||
|
val xyMantissaZero = Bool()
|
||||||
}
|
}
|
||||||
|
|
||||||
val norm = new Area{
|
val norm = new Area{
|
||||||
def xyExponent = math.xyExponent
|
val input = math.output.stage()
|
||||||
def xyMantissa = math.xyMantissa
|
val output = input.swapPayload(new NormOutput)
|
||||||
val xySign = CombInit(math.xySign)
|
output.payload.assignSomeByName(input.payload)
|
||||||
|
import input.payload._
|
||||||
|
|
||||||
val shiftOh = OHMasking.first(xyMantissa.asBools.reverse)
|
val shiftOh = OHMasking.first(xyMantissa.asBools.reverse)
|
||||||
val shift = OHToUInt(shiftOh)
|
val shift = OHToUInt(shiftOh)
|
||||||
val mantissa = (xyMantissa |<< shift)
|
output.mantissa := (xyMantissa |<< shift)
|
||||||
val exponent = xyExponent -^ shift + 1
|
output.exponent := xyExponent -^ shift + 1
|
||||||
val forceZero = xyMantissa === 0 || (input.rs1.isZero && input.rs2.isZero)
|
output.forceInfinity := (input.rs1.isInfinity || input.rs2.isInfinity)
|
||||||
// val forceOverflow = exponent === exponentOne + 128 //Handled by writeback rounding
|
output.forceZero := xyMantissa === 0 || (input.rs1.isZero && input.rs2.isZero)
|
||||||
val forceInfinity = (input.rs1.isInfinity || input.rs2.isInfinity)
|
output.infinityNan := (input.rs1.isInfinity && input.rs2.isInfinity && (input.rs1.sign ^ input.rs2.sign))
|
||||||
val infinityNan = (input.rs1.isInfinity && input.rs2.isInfinity && (input.rs1.sign ^ input.rs2.sign))
|
output.forceNan := input.rs1.isNan || input.rs2.isNan || output.infinityNan
|
||||||
val forceNan = input.rs1.isNan || input.rs2.isNan || infinityNan
|
output.xyMantissaZero := xyMantissa === 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
val result = new Area {
|
||||||
|
val input = norm.output.stage()
|
||||||
|
val output = input.swapPayload(new MergeInput())
|
||||||
|
import input.payload._
|
||||||
|
|
||||||
val output = input.swapPayload(MergeInput())
|
output.source := input.source
|
||||||
output.source := input.source
|
output.lockId := input.lockId
|
||||||
output.lockId := input.lockId
|
output.rd := input.rd
|
||||||
output.rd := input.rd
|
output.value.sign := xySign
|
||||||
output.value.sign := norm.xySign
|
output.value.mantissa := (mantissa >> 2).resized
|
||||||
output.value.mantissa := (norm.mantissa >> 2).resized
|
output.value.exponent := exponent.resized
|
||||||
output.value.exponent := norm.exponent.resized
|
output.value.special := False
|
||||||
output.value.special := False
|
output.roundMode := input.roundMode
|
||||||
output.roundMode := input.roundMode
|
if (p.withDouble) output.format := input.format
|
||||||
if(p.withDouble) output.format := input.format
|
output.scrap := (mantissa(1) | mantissa(0) | roundingScrap)
|
||||||
output.scrap := (norm.mantissa(1) | norm.mantissa(0) | shifter.roundingScrap)
|
|
||||||
|
|
||||||
|
|
||||||
val flag = io.port(input.source).completion.flag
|
val flag = io.port(input.source).completion.flag
|
||||||
flag.NV setWhen(input.valid && (norm.infinityNan || input.rs1.isNanSignaling || input.rs2.isNanSignaling))
|
flag.NV setWhen (input.valid && (infinityNan || input.rs1.isNanSignaling || input.rs2.isNanSignaling))
|
||||||
when(norm.forceNan) {
|
when(forceNan) {
|
||||||
output.value.setNanQuiet
|
output.value.setNanQuiet
|
||||||
} elsewhen(norm.forceZero) {
|
} elsewhen (forceZero) {
|
||||||
output.value.setZero
|
output.value.setZero
|
||||||
when(norm.xyMantissa === 0 || input.rs1.isZero && input.rs2.isZero){
|
when(xyMantissaZero || input.rs1.isZero && input.rs2.isZero) {
|
||||||
output.value.sign := input.rs1.sign && input.rs2.sign
|
output.value.sign := input.rs1.sign && input.rs2.sign
|
||||||
|
}
|
||||||
|
when((input.rs1.sign || input.rs2.sign) && input.roundMode === FpuRoundMode.RDN) {
|
||||||
|
output.value.sign := True
|
||||||
|
}
|
||||||
|
} elsewhen (forceInfinity) {
|
||||||
|
output.value.setInfinity
|
||||||
}
|
}
|
||||||
when((input.rs1.sign || input.rs2.sign) && input.roundMode === FpuRoundMode.RDN){
|
|
||||||
output.value.sign := True
|
|
||||||
}
|
|
||||||
} elsewhen(norm.forceInfinity) {
|
|
||||||
output.value.setInfinity
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1175,37 +1248,55 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
|
||||||
val merge = new Area {
|
val merge = new Area {
|
||||||
//TODO maybe load can bypass merge and round.
|
//TODO maybe load can bypass merge and round.
|
||||||
val inputs = ArrayBuffer[Stream[MergeInput]]()
|
val inputs = ArrayBuffer[Stream[MergeInput]]()
|
||||||
inputs += load.s1.output
|
inputs += load.s1.output.stage()
|
||||||
if(p.withAdd) (inputs += add.output)
|
if(p.withAdd) (inputs += add.result.output)
|
||||||
if(p.withMul) (inputs += mul.output)
|
if(p.withMul) (inputs += mul.result.output)
|
||||||
if(p.withShortPipMisc) (inputs += shortPip.rfOutput)
|
if(p.withShortPipMisc) (inputs += shortPip.rfOutput)
|
||||||
val arbitrated = StreamArbiterFactory.lowerFirst.noLock.on(inputs)
|
val arbitrated = StreamArbiterFactory.lowerFirst.noLock.on(inputs)
|
||||||
val isCommited = rf.lock.map(_.commited).read(arbitrated.lockId)
|
val isCommited = rf.lock.map(_.commited).read(arbitrated.lockId)
|
||||||
val commited = arbitrated.haltWhen(!isCommited).toFlow
|
val commited = arbitrated.haltWhen(!isCommited).toFlow
|
||||||
}
|
}
|
||||||
|
|
||||||
val round = new Area{
|
class RoundFront extends MergeInput{
|
||||||
val input = merge.commited.combStage
|
val mantissaIncrement = Bool()
|
||||||
|
val roundAdjusted = Bits(2 bits)
|
||||||
|
val exactMask = UInt(p.internalMantissaSize + 2 bits)
|
||||||
|
}
|
||||||
|
|
||||||
|
val roundFront = new Area {
|
||||||
|
val input = merge.commited.stage()
|
||||||
|
val output = input.swapPayload(new RoundFront())
|
||||||
|
output.payload.assignSomeByName(input.payload)
|
||||||
|
|
||||||
val manAggregate = input.value.mantissa @@ input.scrap
|
val manAggregate = input.value.mantissa @@ input.scrap
|
||||||
val expBase = muxDouble[UInt](input.format)(exponentF64Subnormal+1)(exponentF32Subnormal+1)
|
val expBase = muxDouble[UInt](input.format)(exponentF64Subnormal + 1)(exponentF32Subnormal + 1)
|
||||||
val expDif = expBase -^ input.value.exponent
|
val expDif = expBase -^ input.value.exponent
|
||||||
val expSubnormal = !expDif.msb
|
val expSubnormal = !expDif.msb
|
||||||
var discardCount = (expSubnormal ? expDif.resize(log2Up(p.internalMantissaSize) bits) | U(0))
|
var discardCount = (expSubnormal ? expDif.resize(log2Up(p.internalMantissaSize) bits) | U(0))
|
||||||
if(p.withDouble) when(input.format === FpuFormat.FLOAT){
|
if (p.withDouble) when(input.format === FpuFormat.FLOAT) {
|
||||||
discardCount \= discardCount + 29
|
discardCount \= discardCount + 29
|
||||||
}
|
}
|
||||||
val exactMask = (List(True) ++ (0 until p.internalMantissaSize+1).map(_ < discardCount)).asBits.asUInt
|
val exactMask = (List(True) ++ (0 until p.internalMantissaSize + 1).map(_ < discardCount)).asBits.asUInt
|
||||||
val roundAdjusted = (True ## (manAggregate>>1))(discardCount) ## ((manAggregate & exactMask) =/= 0)
|
val roundAdjusted = (True ## (manAggregate >> 1)) (discardCount) ## ((manAggregate & exactMask) =/= 0)
|
||||||
|
|
||||||
val mantissaIncrement = !input.value.special && input.roundMode.mux(
|
val mantissaIncrement = !input.value.special && input.roundMode.mux(
|
||||||
FpuRoundMode.RNE -> (roundAdjusted(1) && (roundAdjusted(0) || (U"01" ## (manAggregate>>2))(discardCount))),
|
FpuRoundMode.RNE -> (roundAdjusted(1) && (roundAdjusted(0) || (U"01" ## (manAggregate >> 2)) (discardCount))),
|
||||||
FpuRoundMode.RTZ -> False,
|
FpuRoundMode.RTZ -> False,
|
||||||
FpuRoundMode.RDN -> (roundAdjusted =/= 0 && input.value.sign),
|
FpuRoundMode.RDN -> (roundAdjusted =/= 0 && input.value.sign),
|
||||||
FpuRoundMode.RUP -> (roundAdjusted =/= 0 && !input.value.sign),
|
FpuRoundMode.RUP -> (roundAdjusted =/= 0 && !input.value.sign),
|
||||||
FpuRoundMode.RMM -> (roundAdjusted(1))
|
FpuRoundMode.RMM -> (roundAdjusted(1))
|
||||||
)
|
)
|
||||||
|
|
||||||
|
output.mantissaIncrement := mantissaIncrement
|
||||||
|
output.roundAdjusted := roundAdjusted
|
||||||
|
output.exactMask := exactMask
|
||||||
|
}
|
||||||
|
|
||||||
|
val roundBack = new Area{
|
||||||
|
val input = roundFront.output.stage()
|
||||||
|
val output = input.swapPayload(RoundOutput())
|
||||||
|
import input.payload._
|
||||||
|
|
||||||
val math = p.internalFloating()
|
val math = p.internalFloating()
|
||||||
val mantissaRange = p.internalMantissaSize downto 1
|
val mantissaRange = p.internalMantissaSize downto 1
|
||||||
val adderMantissa = input.value.mantissa(mantissaRange) & (mantissaIncrement ? ~(exactMask.trim(1) >> 1) | input.value.mantissa(mantissaRange).maxValue)
|
val adderMantissa = input.value.mantissa(mantissaRange) & (mantissaIncrement ? ~(exactMask.trim(1) >> 1) | input.value.mantissa(mantissaRange).maxValue)
|
||||||
|
@ -1218,12 +1309,6 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
|
||||||
|
|
||||||
val patched = CombInit(math)
|
val patched = CombInit(math)
|
||||||
val nx,of,uf = False
|
val nx,of,uf = False
|
||||||
// val ufPatch = input.roundMode === FpuRoundMode.RUP && !input.value.sign && !input.scrap|| input.roundMode === FpuRoundMode.RDN && input.value.sign && !input.scrap
|
|
||||||
// when(!math.special && (input.value.exponent <= exponentOne-127 && (math.exponent =/= exponentOne-126 || !input.value.mantissa.lsb || ufPatch)) && roundAdjusted.asUInt =/= 0){
|
|
||||||
// uf := True
|
|
||||||
// }
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
val ufSubnormalThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal)(exponentF32Subnormal)
|
val ufSubnormalThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal)(exponentF32Subnormal)
|
||||||
val ufThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal-52+1)(exponentF32Subnormal-23+1)
|
val ufThreshold = muxDouble[UInt](input.format)(exponentF64Subnormal-52+1)(exponentF32Subnormal-23+1)
|
||||||
|
@ -1277,7 +1362,6 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
|
||||||
flag.OF setWhen(of)
|
flag.OF setWhen(of)
|
||||||
flag.UF setWhen(uf)
|
flag.UF setWhen(uf)
|
||||||
}
|
}
|
||||||
val output = input.swapPayload(RoundOutput())
|
|
||||||
output.source := input.source
|
output.source := input.source
|
||||||
output.lockId := input.lockId
|
output.lockId := input.lockId
|
||||||
output.rd := input.rd
|
output.rd := input.rd
|
||||||
|
@ -1286,7 +1370,7 @@ case class FpuCore( portCount : Int, p : FpuParameter) extends Component{
|
||||||
}
|
}
|
||||||
|
|
||||||
val writeback = new Area{
|
val writeback = new Area{
|
||||||
val input = round.output.combStage
|
val input = roundBack.output.stage()
|
||||||
|
|
||||||
for(i <- 0 until portCount){
|
for(i <- 0 until portCount){
|
||||||
completion(i).increments += (RegNext(input.fire && input.source === i) init(False))
|
completion(i).increments += (RegNext(input.fire && input.source === i) init(False))
|
||||||
|
@ -1393,12 +1477,7 @@ object FpuSynthesisBench extends App{
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
//Fpu_32 ->
|
|
||||||
//Artix 7 -> 46 Mhz 1786 LUT 628 FF
|
|
||||||
//Artix 7 -> 47 Mhz 1901 LUT 628 FF
|
|
||||||
//Fpu_64 ->
|
|
||||||
//Artix 7 -> 37 Mhz 3407 LUT 1006 FF
|
|
||||||
//Artix 7 -> 36 Mhz 3564 LUT 1006 FF
|
|
||||||
|
|
||||||
val rtls = ArrayBuffer[Rtl]()
|
val rtls = ArrayBuffer[Rtl]()
|
||||||
rtls += new Fpu(
|
rtls += new Fpu(
|
||||||
|
|
|
@ -109,6 +109,8 @@ object FpuRoundModeInstr extends SpinalEnum(){
|
||||||
|
|
||||||
|
|
||||||
case class FpuParameter( withDouble : Boolean,
|
case class FpuParameter( withDouble : Boolean,
|
||||||
|
mulWidthA : Int = 18,
|
||||||
|
mulWidthB : Int = 18,
|
||||||
sim : Boolean = false,
|
sim : Boolean = false,
|
||||||
withAdd : Boolean = true,
|
withAdd : Boolean = true,
|
||||||
withMul : Boolean = true,
|
withMul : Boolean = true,
|
||||||
|
|
Loading…
Reference in New Issue