Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,11 @@ This README only contains a brief overview of the library's current contents. Al

Utilizing Chisel and ChiselSim, `approx` requires a suitable installation of Scala. For this purpose, we use the Scala Build Tool (`sbt`) for which we provide a suitable build script. The provided tests require a recent version of Verilator.

This library is tested in Ubuntu 24.04 with Verilator 5.032. Note that the default Verilator version (5.020) available through `apt` in Ubuntu 24.04 is _not_ new enough.
This library is tested in Ubuntu 24.04 with Verilator 5.032. Note that the default Verilator version (5.020) available through `apt` in Ubuntu 24.04 is _not_ new enough. If you wish to have VCD dumps from the simulations, pass the `emitVcd` flag to `testOnly`, for example:

```bash
sbt "testOnly approx.addition.RCASpec -- -DemitVcd=1"
```

***
# Adders
Expand Down
166 changes: 119 additions & 47 deletions src/main/scala/approx/accumulation/Exact.scala
Original file line number Diff line number Diff line change
@@ -1,65 +1,101 @@
package approx.accumulation

import chisel3._
import chisel3.util.RegEnable
import chisel3.util.experimental.FlattenInstance

import approx.util.PRShiftReg
import approx.multiplication.comptree.{Approximation, Signature, CompressorTree}

/** Simple accumulator
*
* @param inW the width of the input operand
* @param accW the width of the accumulator
* @param signed whether the input operands are signed (defaults to false)
* @param pipes the number of pipeline stages (defaults to 0)
*
* Pipelining relies on retiming!
*/
class SimpleAccumulator(inW: Int, accW: Int, signed: Boolean = false) extends SA(inW, accW, signed) {
class SimpleAccumulator(inW: Int, accW: Int, signed: Boolean = false, pipes: Int = 0)
extends SA(inW, accW, signed, pipes) {
// Extend the input to the width of the accumulator if needed
val inExt = if (inW < accW) {
val sext = if (signed) VecInit(Seq.fill(accW - inW)(io.in(inW-1))).asUInt else 0.U((accW - inW).W)
sext ## io.in
} else io.in(accW-1, 0)

val acc = RegInit(0.U(accW.W))
acc := inExt + Mux(io.zero, 0.U, acc)
// Pass the extended input through a series of registers
val dataShReg = Module(new PRShiftReg(UInt(accW.W), pipes))
dataShReg.io.in := inExt

// Pass enable and zero through a shift register as needed
val enShReg = Module(new PRShiftReg(Bool(), pipes))
enShReg.io.in := io.en
val zeroShReg = Module(new PRShiftReg(Bool(), pipes))
zeroShReg.io.in := io.zero

// Compute the sum and register the accumulator
val sum = Wire(UInt(accW.W))
val acc = RegEnable(sum, 0.U(accW.W), enShReg.io.out.last)
sum := dataShReg.io.out.last + Mux(zeroShReg.io.out.last, 0.U, acc)
io.acc := acc
}

/** Multiply accumulator
*
* @param inW the width of the input operands
* @param inAW the width of the first input operand
* @param inBW the width of the second input operand
* @param accW the width of the accumulator
* @param signed whether the input operands are signed (defaults to false)
* @param pipes the number of pipeline stages (defaults to 0)
*
* @todo Extend with different signs and operand bit-widths.
* Pipelining relies on retiming!
*/
class MultiplyAccumulator(inW: Int, accW: Int, signed: Boolean = false) extends MAC(inW, accW, signed) {
class MultiplyAccumulator(inAW: Int, inBW: Int, accW: Int, signed: Boolean = false, pipes: Int = 0)
extends MAC(inAW, inBW, accW, signed, pipes) {
// Compute and extend the product to the width of the accumulator if needed
val prodExt = if (2 * inW < accW) {
val prodExt = if ((inAW + inBW) < accW) {
val prod = if (signed) (io.a.asSInt * io.b.asSInt).asUInt else io.a * io.b
val sext = if (signed) VecInit(Seq.fill(accW - 2 * inW)(prod(2*inW-1))).asUInt else 0.U((accW - 2 * inW).W)
val sext = if (signed) VecInit(Seq.fill(accW - inAW - inBW)(prod(inAW + inBW - 1))).asUInt else 0.U((accW - inAW - inBW).W)
sext ## prod
} else {
(if (signed) (io.a.asSInt * io.b.asSInt) else (io.a * io.b))(accW-1, 0)
}

val acc = RegInit(0.U(accW.W))
acc := prodExt + Mux(io.zero, 0.U, acc)
// Pass the extended product through a series of registers
val dataShReg = Module(new PRShiftReg(UInt(accW.W), pipes))
dataShReg.io.in := prodExt

// Pass enable and zero through a shift register as needed
val enShReg = Module(new PRShiftReg(Bool(), pipes))
enShReg.io.in := io.en
val zeroShReg = Module(new PRShiftReg(Bool(), pipes))
zeroShReg.io.in := io.zero

// Compute the sum and register the accumulator
val sum = Wire(UInt(accW.W))
val acc = RegEnable(sum, 0.U(accW.W), enShReg.io.out.last)
sum := dataShReg.io.out.last + Mux(zeroShReg.io.out.last, 0.U, acc)
io.acc := acc
}

/** Bit matrix accumulator
*
* @param sig the input bit matrix' signature
* @param accW the width of the accumulator
* @param pipes the number of pipeline stages (defaults to 0)
* @param targetDevice a string indicating the target device
* (defaults to "", meaning ASIC)
* @param mtrc which metric to use for selecting counters (defaults to efficiency)
* @param approx the targeted approximation styles (defaults to no approximation)
*
* Pipelining relies on retiming!
*
* @todo Consider building pipelining into the compressor tree generator.
*/
class BitMatrixAccumulator(sig: Signature, accW: Int, targetDevice: String = "",
class BitMatrixAccumulator(sig: Signature, accW: Int, pipes: Int = 0, targetDevice: String = "",
mtrc: Char = 'e', approx: Seq[Approximation] = Seq.empty[Approximation])
extends MxAC(sig, accW) with FlattenInstance {
val acc = RegInit(0.U(accW.W))

extends MxAC(sig, accW, pipes) with FlattenInstance {
// Add the accumulator to the input signature
val sigExt = new Signature((0 until scala.math.max(accW, sig.length)).map { c =>
val sigCnt = if (c < sig.length) sig.signature(c) else 0
Expand All @@ -68,7 +104,8 @@ class BitMatrixAccumulator(sig: Signature, accW: Int, targetDevice: String = "",
}.toArray)

// Build a compressor tree and assign its inputs and outputs
val comp = Module(CompressorTree(sigExt, targetDevice=targetDevice, mtrc=mtrc, approx=approx))
val acc = Wire(UInt(accW.W))
val comp = Module(CompressorTree(sigExt, targetDevice=targetDevice, mtrc=mtrc, approx=approx))
val compIns = Wire(Vec(sigExt.count, Bool()))
var (inOffset, compOffset) = (0, 0)
(0 until scala.math.max(accW, sig.length)).foreach { c =>
Expand All @@ -88,9 +125,19 @@ class BitMatrixAccumulator(sig: Signature, accW: Int, targetDevice: String = "",
}
}

// Pass the compressor output through a series of registers
val dataShReg = Module(new PRShiftReg(UInt(accW.W), pipes))
comp.io.in := compIns.asUInt
acc := comp.io.out
io.acc := acc
dataShReg.io.in := comp.io.out

// Pass enable through a shift register as needed
val enShReg = Module(new PRShiftReg(Bool(), pipes))
enShReg.io.in := io.en

// Compute the sum and register the accumulator
val accReg = RegEnable(comp.io.out, 0.U(accW.W), enShReg.io.out.last)
acc := accReg
io.acc := accReg
}

/** Parallel simple accumulator
Expand All @@ -99,15 +146,19 @@ class BitMatrixAccumulator(sig: Signature, accW: Int, targetDevice: String = "",
* @param inW the width of the input operands
* @param accW the width of the accumulator
* @param signed whether the input operands are signed (defaults to false)
* @param pipes the number of pipeline stages (defaults to 0)
* @param comp whether to use the compressor tree generator (defaults to false)
* @param targetDevice a string indicating the target device
* (defaults to "", meaning ASIC)
* @param mtrc which metric to use for selecting counters (defaults to efficiency)
* @param approx the targeted approximation styles (defaults to no approximation)
*
* Pipelining relies on retiming!
*/
class ParallelSimpleAccumulator(nIn: Int, inW: Int, accW: Int, signed: Boolean = false,
comp: Boolean = false, targetDevice: String = "", mtrc: Char = 'e', approx: Seq[Approximation] = Seq.empty[Approximation])
extends PSA(nIn, inW, accW, signed) with FlattenInstance {
pipes: Int = 0, comp: Boolean = false, targetDevice: String = "", mtrc: Char = 'e',
approx: Seq[Approximation] = Seq.empty[Approximation])
extends PSA(nIn, inW, accW, signed, pipes) with FlattenInstance {
// Extend the inputs to the width of the accumulator if needed
val insExt = if (inW < accW) {
if (signed) {
Expand All @@ -123,49 +174,60 @@ class ParallelSimpleAccumulator(nIn: Int, inW: Int, accW: Int, signed: Boolean =
val sig = new Signature(Array.fill(extW)(nIn))

// Build a bit matrix accumulator and assign its inputs and outputs
val mxAcc = Module(new BitMatrixAccumulator(sig, accW, targetDevice, mtrc, approx))
val mxAcc = Module(new BitMatrixAccumulator(sig, accW, pipes, targetDevice, mtrc, approx))
val accIns = VecInit((0 until extW).flatMap { c => (0 until nIn).map(i => insExt(i)(c)) }).asUInt

mxAcc.io.en := io.en
mxAcc.io.zero := io.zero
mxAcc.io.in := accIns
io.acc := mxAcc.io.acc
} else {
// Instantiate an accumulator register
val acc = RegInit(0.U(accW.W))
// Pass the parallel sum through a series of registers
val dataShReg = Module(new PRShiftReg(UInt(accW.W), pipes))
dataShReg.io.in := insExt.reduceTree(_ +& _)

// Pass enable and zero through a shift register as needed
val enShReg = Module(new PRShiftReg(Bool(), pipes))
enShReg.io.in := io.en
val zeroShReg = Module(new PRShiftReg(Bool(), pipes))
zeroShReg.io.in := io.zero

// Connect and sum the extended inputs
acc := insExt.reduceTree(_ +& _) + Mux(io.zero, 0.U, acc)
// Compute the sum and register the accumulator
val sum = Wire(UInt(accW.W))
val acc = RegEnable(sum, 0.U(accW.W), enShReg.io.out.last)
sum := dataShReg.io.out.last + Mux(zeroShReg.io.out.last, 0.U, acc)
io.acc := acc
}
}

/** Parallel multiply accumulator
*
* @param nIn the number of parallel input operands
* @param inW the width of the input operands
* @param inAW the width of the first input operands
* @param inBW the width of the second input operands
* @param accW the width of the accumulator
* @param signed whether the input operands are signed (defaults to false)
* @param pipes the number of pipeline stages (defaults to 0)
* @param comp whether to use the compressor tree generator (defaults to false)
* @param targetDevice a string indicating the target device
* (defaults to "", meaning ASIC)
* @param mtrc which metric to use for selecting counters (defaults to efficiency)
* @param approx the targeted approximation styles (defaults to no approximation)
*
* @todo Extend with different signs and operand bit-widths.
* Pipelining relies on retiming!
*/
class ParallelMultiplyAccumulator(nIn: Int, inW: Int, accW: Int, signed: Boolean = false,
comp: Boolean = false, targetDevice: String = "", mtrc: Char = 'e', approx: Seq[Approximation] = Seq.empty[Approximation])
extends PMAC(nIn, inW, accW, signed) with FlattenInstance {
val aW = io.as.head.getWidth
val bW = io.bs.head.getWidth
class ParallelMultiplyAccumulator(nIn: Int, inAW: Int, inBW: Int, accW: Int, signed: Boolean = false,
pipes: Int = 0, comp: Boolean = false, targetDevice: String = "", mtrc: Char = 'e',
approx: Seq[Approximation] = Seq.empty[Approximation])
extends PMAC(nIn, inAW, inBW, accW, signed, pipes) with FlattenInstance {

// Depending on the parameters passed, generate a naive accumulator or use
// the custom compressor tree generator
if (comp) {
// Compute some constants and generate the sign-extension constant
val midLo = scala.math.min(aW, bW) - 1
val midHi = scala.math.max(aW, bW) - 1
val upper = aW + bW - 1
val midLo = scala.math.min(inAW, inBW) - 1
val midHi = scala.math.max(inAW, inBW) - 1
val upper = inAW + inBW - 1
val extConst = if (signed) Seq.fill(nIn) {
(BigInt(-1) << upper) + (BigInt(1) << midLo) + (BigInt(1) << midHi)
}.sum else BigInt(0)
Expand All @@ -177,7 +239,7 @@ class ParallelMultiplyAccumulator(nIn: Int, inW: Int, accW: Int, signed: Boolean
*/
def dotCount(col: Int): Int = {
if (col < midLo) col + 1
else if (midLo <= col && col <= midHi) scala.math.min(aW, bW)
else if (midLo <= col && col <= midHi) scala.math.min(inAW, inBW)
else if (col < upper) upper - col
else 0
}
Expand All @@ -188,7 +250,7 @@ class ParallelMultiplyAccumulator(nIn: Int, inW: Int, accW: Int, signed: Boolean
* @param col the index of the column
* @return the index of the least significant row
*/
def lsRow(col: Int): Int = if (col < inW) 0 else (col - inW + 1)
def lsRow(col: Int): Int = if (col < inBW) 0 else (col - inBW + 1)

// Generate the signature of the needed compressor tree
val sig = new Signature((0 until scala.math.max(upper + 1, accW)).map { c =>
Expand All @@ -200,22 +262,22 @@ class ParallelMultiplyAccumulator(nIn: Int, inW: Int, accW: Int, signed: Boolean
// Compute the partial products
val prods = if (signed) {
(0 until nIn).map { i =>
(0 until aW).map { r =>
val pprod = VecInit((0 until bW).map { c =>
(0 until inAW).map { r =>
val pprod = VecInit((0 until inBW).map { c =>
val dot = io.as(i)(r) & io.bs(i)(c)
if (c == (bW - 1)) !dot else dot
if (c == (inBW - 1)) !dot else dot
}).asUInt
if (r == (aW - 1)) ~pprod else pprod
if (r == (inAW - 1)) ~pprod else pprod
}
}
} else {
(0 until nIn).map { i =>
(0 until aW).map { r => VecInit(Seq.fill(bW)(io.as(i)(r))).asUInt & io.bs(i) }
(0 until inAW).map { r => VecInit(Seq.fill(inBW)(io.as(i)(r))).asUInt & io.bs(i) }
}
}

// Build a bit matrix accumulator and assign its inputs and outputs
val mxAcc = Module(new BitMatrixAccumulator(sig, accW, targetDevice, mtrc, approx))
val mxAcc = Module(new BitMatrixAccumulator(sig, accW, pipes, targetDevice, mtrc, approx))
val accIns = Wire(Vec(sig.count, Bool()))
var compOffset = 0
(0 until sig.length).foreach { c =>
Expand All @@ -236,15 +298,16 @@ class ParallelMultiplyAccumulator(nIn: Int, inW: Int, accW: Int, signed: Boolean
}
}

mxAcc.io.en := io.en
mxAcc.io.zero := io.zero
mxAcc.io.in := accIns.asUInt
io.acc := mxAcc.io.acc
} else {
// Compute and sign-extend the incoming products as needed
val prodsExt = if (aW + bW < accW) {
val prodsExt = if ((inAW + inBW) < accW) {
VecInit(io.as.zip(io.bs).map { case (a, b) =>
val prod = if (signed) (a.asSInt * b.asSInt).asUInt else (a * b)
val sext = if (signed) VecInit(Seq.fill(accW - aW - bW)(prod(aW+bW-1))).asUInt else 0.U((accW - aW - bW).W)
val sext = if (signed) VecInit(Seq.fill(accW - inAW - inBW)(prod(inAW + inBW - 1))).asUInt else 0.U((accW - inAW - inBW).W)
sext ## prod
})
} else {
Expand All @@ -253,11 +316,20 @@ class ParallelMultiplyAccumulator(nIn: Int, inW: Int, accW: Int, signed: Boolean
})
}

// Instantiate an accumulator register
val acc = RegInit(0.U(accW.W))
// Pass the parallel sum through a series of registers
val dataShReg = Module(new PRShiftReg(UInt(accW.W), pipes))
dataShReg.io.in := prodsExt.reduceTree(_ +& _)

// Pass enable and zero through a shift register as needed
val enShReg = Module(new PRShiftReg(Bool(), pipes))
enShReg.io.in := io.en
val zeroShReg = Module(new PRShiftReg(Bool(), pipes))
zeroShReg.io.in := io.zero

// Connect and sum the extended products
acc := prodsExt.reduceTree(_ +& _) + Mux(io.zero, 0.U, acc)
val sum = Wire(UInt(accW.W))
val acc = RegEnable(sum, 0.U(accW.W), enShReg.io.out.last)
sum := dataShReg.io.out.last + Mux(zeroShReg.io.out.last, 0.U, acc)
io.acc := acc
}
}
Loading