diff --git a/src/main/scala/vexriscv/demo/smp/VexRiscvSmpCluster.scala b/src/main/scala/vexriscv/demo/smp/VexRiscvSmpCluster.scala
index 0a8c3ec..38506f2 100644
--- a/src/main/scala/vexriscv/demo/smp/VexRiscvSmpCluster.scala
+++ b/src/main/scala/vexriscv/demo/smp/VexRiscvSmpCluster.scala
@@ -54,8 +54,9 @@ case class VexRiscvSmpCluster(p : VexRiscvSmpClusterParameter,
     val core = new VexRiscv(cpuConfig)
     core.plugins.foreach {
       case plugin: IBusCachedPlugin => iBus = plugin.iBus.toBmb()
-      case plugin: DBusCachedPlugin => dBus = plugin.dBus.toBmb()
+      case plugin: DBusCachedPlugin => dBus = plugin.dBus.toBmb().pipelined(cmdValid = true)
       case plugin: CsrPlugin => {
+        plugin.externalMhartId := cpuId
         plugin.softwareInterrupt := io.softwareInterrupts(cpuId)
         plugin.externalInterrupt := io.externalInterrupts(cpuId)
         plugin.timerInterrupt := io.timerInterrupts(cpuId)
@@ -112,9 +113,12 @@ case class VexRiscvSmpCluster(p : VexRiscvSmpClusterParameter,
 
 
 object VexRiscvSmpClusterGen {
-  def vexRiscvConfig(hartId : Int,
+  def vexRiscvConfig(hartIdWidth : Int,
+                     hartId : Int,
                      ioRange : UInt => Bool = (x => x(31 downto 28) === 0xF),
                      resetVector : Long = 0x80000000l) = {
+    val iBusWidth = 128
+    val dBusWidth = 64
     val config = VexRiscvConfig(
       plugins = List(
         new MmuPlugin(
@@ -135,7 +139,7 @@ object VexRiscvSmpClusterGen {
             wayCount = 2,
             addressWidth = 32,
             cpuDataWidth = 32,
-            memDataWidth = 128,
+            memDataWidth = iBusWidth,
             catchIllegalAccess = true,
             catchAccessFault = true,
             asyncTagMemory = false,
@@ -151,7 +155,7 @@ object VexRiscvSmpClusterGen {
           )
         ),
         new DBusCachedPlugin(
-          dBusCmdMasterPipe = true,
+          dBusCmdMasterPipe = dBusWidth == 32,
           dBusCmdSlavePipe = true,
           dBusRspSlavePipe = true,
           relaxedMemoryTranslationRegister = true,
@@ -161,14 +165,15 @@ object VexRiscvSmpClusterGen {
             wayCount = 1,
             addressWidth = 32,
             cpuDataWidth = 32,
-            memDataWidth = 32,
+            memDataWidth = dBusWidth,
             catchAccessError = true,
             catchIllegal = true,
             catchUnaligned = true,
             withLrSc = true,
             withAmo = true,
             withExclusive = true,
-            withInvalidate = true
+            withInvalidate = true,
+            aggregationWidth = if(dBusWidth == 32) 0 else log2Up(dBusWidth/8) //
            )
          ),
          memoryTranslatorPortConfig = MmuPortConfig(
@@ -208,7 +213,7 @@ object VexRiscvSmpClusterGen {
           mulUnrollFactor = 32,
           divUnrollFactor = 1
         ),
-        new CsrPlugin(CsrPluginConfig.openSbi(hartId = hartId, misa = Riscv.misaToInt("imas"))),
+        new CsrPlugin(CsrPluginConfig.openSbi(misa = Riscv.misaToInt("imas")).copy(withExternalMhartid = true, mhartidWidth = hartIdWidth)),
         new BranchPlugin(
           earlyBranch = false,
           catchAddressMisaligned = true,
@@ -224,7 +229,7 @@ object VexRiscvSmpClusterGen {
       debugClockDomain = ClockDomain.current.copy(reset = Bool().setName("debugResetIn")),
       p = VexRiscvSmpClusterParameter(
         cpuConfigs = List.tabulate(cpuCount) {
-          vexRiscvConfig(_, resetVector = resetVector)
+          vexRiscvConfig(log2Up(cpuCount), _, resetVector = resetVector)
         }
       )
     )
@@ -462,7 +467,7 @@ object VexRiscvSmpClusterOpenSbi extends App{
   simConfig.allOptimisation
   simConfig.addSimulatorFlag("--threads 1")
 
-  val cpuCount = 1
+  val cpuCount = 4
   val withStall = false
 
   def gen = {
@@ -573,8 +578,8 @@ object VexRiscvSmpClusterOpenSbi extends App{
 
 //    fork{
 //      disableSimWave()
-//      val atMs = 130
-//      val durationMs = 15
+//      val atMs = 3790
+//      val durationMs = 5
 //      sleep(atMs*1000000)
 //      enableSimWave()
 //      println("** enableSimWave **")
diff --git a/src/main/scala/vexriscv/demo/smp/VexRiscvSmpLitexCluster.scala b/src/main/scala/vexriscv/demo/smp/VexRiscvSmpLitexCluster.scala
index 03ccc89..3f3047f 100644
--- a/src/main/scala/vexriscv/demo/smp/VexRiscvSmpLitexCluster.scala
+++ b/src/main/scala/vexriscv/demo/smp/VexRiscvSmpLitexCluster.scala
@@ -73,13 +73,13 @@ case class LiteDramNative(p : LiteDramNativeParameter) extends Bundle with IMast
     }
 
     var writeCmdCounter, writeDataCounter = 0
-    StreamReadyRandomizer(bus.cmd, cd)
+    StreamReadyRandomizer(bus.cmd, cd).factor = 0.5f
     StreamMonitor(bus.cmd, cd) { t =>
      cmdQueue.enqueue(Cmd(t.addr.toLong * (p.dataWidth/8) , t.we.toBoolean))
      if(t.we.toBoolean) writeCmdCounter += 1
     }
-    StreamReadyRandomizer(bus.wdata, cd)
+    StreamReadyRandomizer(bus.wdata, cd).factor = 0.5f
     StreamMonitor(bus.wdata, cd) { p =>
       writeDataCounter += 1
 //      if(p.data.toBigInt == BigInt("00000002000000020000000200000002",16)){
@@ -175,16 +175,19 @@ case class BmbToLiteDram(bmbParameter : BmbParameter,
   val halt = Bool()
   val (cmdFork, dataFork) = StreamFork2(unburstified.cmd.haltWhen(halt))
 
-  io.output.cmd.arbitrationFrom(cmdFork.haltWhen(pendingRead.msb))
-  io.output.cmd.addr := (cmdFork.address >> log2Up(liteDramParameter.dataWidth/8)).resized
-  io.output.cmd.we := cmdFork.isWrite
+  val outputCmd = Stream(LiteDramNativeCmd(liteDramParameter))
+  outputCmd.arbitrationFrom(cmdFork.haltWhen(pendingRead.msb))
+  outputCmd.addr := (cmdFork.address >> log2Up(liteDramParameter.dataWidth/8)).resized
+  outputCmd.we := cmdFork.isWrite
+
+  io.output.cmd <-< outputCmd
 
   if(bmbParameter.canWrite) {
     val wData = Stream(LiteDramNativeWData(liteDramParameter))
     wData.arbitrationFrom(dataFork.throwWhen(dataFork.isRead))
     wData.data := dataFork.data
     wData.we := dataFork.mask
-    io.output.wdata << wData.queue(wdataFifoSize)
+    io.output.wdata << wData.queueLowLatency(wdataFifoSize, latency = 1) //TODO queue low latency
   } else {
     dataFork.ready := True
     io.output.wdata.valid := False
@@ -212,7 +215,7 @@ case class BmbToLiteDram(bmbParameter : BmbParameter,
   unburstified.rsp.data := rdataFifo.data
 
 
-  pendingRead := pendingRead + U(io.output.cmd.fire && !io.output.cmd.we) - U(rdataFifo.fire)
+  pendingRead := pendingRead + U(outputCmd.fire && !outputCmd.we) - U(rdataFifo.fire)
 }
 
 object BmbToLiteDramTester extends App{
@@ -241,6 +244,7 @@ case class VexRiscvLitexSmpClusterParameter( cluster : VexRiscvSmpClusterParamet
                                              liteDram : LiteDramNativeParameter,
                                              liteDramMapping : AddressMapping)
 
+//addAttribute("""mark_debug = "true"""")
 case class VexRiscvLitexSmpCluster(p : VexRiscvLitexSmpClusterParameter,
                                    debugClockDomain : ClockDomain) extends Component{
 
@@ -308,50 +312,59 @@ case class VexRiscvLitexSmpCluster(p : VexRiscvLitexSmpClusterParameter,
   iBusDecoder.io.input << iBusArbiter.io.output.pipelined(cmdValid = true)
 
   val iMem = LiteDramNative(p.liteDram)
-  val iMemBridge = iMem.fromBmb(iBusDecoder.io.outputs(1), wdataFifoSize = 0, rdataFifoSize = 32)
-  iMem.cmd >-> io.iMem.cmd
-  iMem.wdata >> io.iMem.wdata
-  iMem.rdata << io.iMem.rdata
+  io.iMem.fromBmb(iBusDecoder.io.outputs(1), wdataFifoSize = 0, rdataFifoSize = 32)
+
+  val iBusDecoderToPeripheral = iBusDecoder.io.outputs(0).resize(dataWidth = 32).pipelined(cmdHalfRate = true, rspValid = true)
+  val dBusDecoderToPeripheral = dBusDecoder.io.outputs(0).resize(dataWidth = 32).pipelined(cmdHalfRate = true, rspValid = true)
+
+  val peripheralAccessLength = Math.max(iBusDecoder.io.outputs(0).p.lengthWidth, dBusDecoder.io.outputs(0).p.lengthWidth)
   val peripheralArbiter = BmbArbiter(
-    p = dBusDecoder.io.outputs(0).p.copy(sourceWidth = dBusDecoder.io.outputs(0).p.sourceWidth + 1, lengthWidth = peripheralAccessLength),
+    p = dBusDecoder.io.outputs(0).p.copy(
+      sourceWidth = List(iBusDecoderToPeripheral, dBusDecoderToPeripheral).map(_.p.sourceWidth).max + 1,
+      contextWidth = List(iBusDecoderToPeripheral, dBusDecoderToPeripheral).map(_.p.contextWidth).max,
+      lengthWidth = peripheralAccessLength,
+      dataWidth = 32
+    ),
     portCount = 2,
     lowerFirstPriority = true
   )
-  peripheralArbiter.io.inputs(0) << iBusDecoder.io.outputs(0).resize(dataWidth = 32).pipelined(cmdHalfRate = true, rspValid = true)
-  peripheralArbiter.io.inputs(1) << dBusDecoder.io.outputs(0).resize(dataWidth = 32).pipelined(cmdHalfRate = true, rspValid = true)
+  peripheralArbiter.io.inputs(0) << iBusDecoderToPeripheral
+  peripheralArbiter.io.inputs(1) << dBusDecoderToPeripheral
 
   val peripheralWishbone = peripheralArbiter.io.output.pipelined(cmdValid = true).toWishbone()
   io.peripheral << peripheralWishbone
 }
 
 object VexRiscvLitexSmpClusterGen extends App {
-  val cpuCount = 4
+  for(cpuCount <- List(1,2,4,8)) {
+    def parameter = VexRiscvLitexSmpClusterParameter(
+      cluster = VexRiscvSmpClusterParameter(
+        cpuConfigs = List.tabulate(cpuCount) { hartId =>
+          vexRiscvConfig(
+            hartIdWidth = log2Up(cpuCount),
+            hartId = hartId,
+            ioRange = address => address.msb,
+            resetVector = 0
+          )
+        }
+      ),
+      liteDram = LiteDramNativeParameter(addressWidth = 32, dataWidth = 128),
+      liteDramMapping = SizeMapping(0x40000000l, 0x40000000l)
+    )
-  def parameter = VexRiscvLitexSmpClusterParameter(
-    cluster = VexRiscvSmpClusterParameter(
-      cpuConfigs = List.tabulate(cpuCount) { hartId =>
-        vexRiscvConfig(
-          hartId = hartId,
-          ioRange = address => address.msb,
-          resetVector = 0
-        )
-      }
-    ),
-    liteDram = LiteDramNativeParameter(addressWidth = 32, dataWidth = 128),
-    liteDramMapping = SizeMapping(0x40000000l, 0x40000000l)
-  )
+    def dutGen = {
+      val toplevel = VexRiscvLitexSmpCluster(
+        p = parameter,
+        debugClockDomain = ClockDomain.current.copy(reset = Bool().setName("debugResetIn"))
+      )
+      toplevel
+    }
-  def dutGen = VexRiscvLitexSmpCluster(
-    p = parameter,
-    debugClockDomain = ClockDomain.current.copy(reset = Bool().setName("debugResetIn"))
-  )
-
-  val genConfig = SpinalConfig().addStandardMemBlackboxing(blackboxByteEnables)
-//  genConfig.generateVerilog(Bench.compressIo(dutGen))
-  genConfig.generateVerilog(dutGen)
+    val genConfig = SpinalConfig().addStandardMemBlackboxing(blackboxByteEnables)
+    // genConfig.generateVerilog(Bench.compressIo(dutGen))
+    genConfig.generateVerilog(dutGen.setDefinitionName(s"VexRiscvLitexSmpCluster_${cpuCount}c"))
+  }
 }
@@ -363,13 +376,13 @@ object VexRiscvLitexSmpClusterOpenSbi extends App{
   simConfig.withWave
   simConfig.allOptimisation
 
-  val cpuCount = 4
-  val withStall = false
+  val cpuCount = 8
 
   def parameter = VexRiscvLitexSmpClusterParameter(
     cluster = VexRiscvSmpClusterParameter(
       cpuConfigs = List.tabulate(cpuCount) { hartId =>
        vexRiscvConfig(
+          hartIdWidth = log2Up(cpuCount),
          hartId = hartId,
          ioRange = address => address(31 downto 28) === 0xF,
          resetVector = 0x80000000l
@@ -440,12 +453,12 @@ object VexRiscvLitexSmpClusterOpenSbi extends App{
 
 //  fork{
 //    disableSimWave()
-//    val atMs = 8
-//    val durationMs = 3
-//    sleep(atMs*1000000)
+//    val atMs = 3790
+//    val durationMs = 5
+//    sleep(atMs*1000000l)
 //    enableSimWave()
 //    println("** enableSimWave **")
-//    sleep(durationMs*1000000)
+//    sleep(durationMs*1000000l)
 //    println("** disableSimWave **")
 //    while(true) {
 //      disableSimWave()
@@ -453,7 +466,7 @@ object VexRiscvLitexSmpClusterOpenSbi extends App{
 //      enableSimWave()
 //      sleep( 100 * 10)
 //    }
-//    //    simSuccess()
+//  //    simSuccess()
 //  }
 
   fork{
diff --git a/src/main/scala/vexriscv/ip/DataCache.scala b/src/main/scala/vexriscv/ip/DataCache.scala
index 2f2e8c2..82fa3af 100644
--- a/src/main/scala/vexriscv/ip/DataCache.scala
+++ b/src/main/scala/vexriscv/ip/DataCache.scala
@@ -5,7 +5,7 @@ import spinal.core._
 import spinal.lib._
 import spinal.lib.bus.amba4.axi.{Axi4Config, Axi4Shared}
 import spinal.lib.bus.avalon.{AvalonMM, AvalonMMConfig}
-import spinal.lib.bus.bmb.{Bmb, BmbParameter}
+import spinal.lib.bus.bmb.{Bmb, BmbCmd, BmbParameter}
 import spinal.lib.bus.wishbone.{Wishbone, WishboneConfig}
 import spinal.lib.bus.simple._
 import vexriscv.plugin.DBusSimpleBus
@@ -29,7 +29,8 @@ case class DataCacheConfig(cacheSize : Int,
                            withInvalidate : Boolean = false,
                            pendingMax : Int = 32,
                            directTlbHit : Boolean = false,
-                           mergeExecuteMemory : Boolean = false){
+                           mergeExecuteMemory : Boolean = false,
+                           aggregationWidth : Int = 0){
   assert(!(mergeExecuteMemory && (earlyDataMux || earlyWaysHits)))
   assert(!(earlyDataMux && !earlyWaysHits))
   assert(isPow2(pendingMax))
@@ -41,6 +42,8 @@ case class DataCacheConfig(cacheSize : Int,
   def withInternalLrSc = withLrSc && !withExclusive
   def withExternalLrSc = withLrSc && withExclusive
   def withExternalAmo = withAmo && withExclusive
+  def cpuDataBytes = cpuDataWidth/8
+  def memDataBytes = memDataWidth/8
   def getAxi4SharedConfig() = Axi4Config(
     addressWidth = addressWidth,
     dataWidth = memDataWidth,
@@ -79,10 +82,10 @@ case class DataCacheConfig(cacheSize : Int,
 
   def getBmbParameter() = BmbParameter(
     addressWidth = 32,
-    dataWidth = 32,
+    dataWidth = memDataWidth,
     lengthWidth = log2Up(this.bytePerLine),
     sourceWidth = 0,
-    contextWidth = if(!withWriteResponse) 1 else 0,
+    contextWidth = (if(!withWriteResponse) 1 else 0) + (if(cpuDataWidth != memDataWidth) log2Up(memDataBytes) else 0),
     canRead = true,
     canWrite = true,
     alignment = BmbParameter.BurstAlignement.LENGTH,
@@ -203,6 +206,7 @@ case class DataCacheMemCmd(p : DataCacheConfig) extends Bundle{
   val last = Bool
 }
 case class DataCacheMemRsp(p : DataCacheConfig) extends Bundle{
+  val aggregated = UInt(p.aggregationWidth bits)
   val last = Bool()
   val data = Bits(p.memDataWidth bit)
   val error = Bool
@@ -217,7 +221,7 @@ case class DataCacheAck(p : DataCacheConfig) extends Bundle{
 }
 
 case class DataCacheSync(p : DataCacheConfig) extends Bundle{
-
+  val aggregated = UInt(p.aggregationWidth bits)
 }
 
 case class DataCacheMemBus(p : DataCacheConfig) extends Bundle with IMasterSlave{
@@ -369,21 +373,133 @@ case class DataCacheMemBus(p : DataCacheConfig) extends Bundle with IMasterSlave
   }
 
-  def toBmb() : Bmb = {
+  def toBmb(syncPendingMax : Int = 16,
+            timeoutCycles : Int = 16) : Bmb = new Area{
+    setCompositeName(DataCacheMemBus.this, "Bridge", true)
     val pipelinedMemoryBusConfig = p.getBmbParameter()
     val bus = Bmb(pipelinedMemoryBusConfig).setCompositeName(this,"toBmb", true)
+    val aggregationMax = p.memDataBytes
 
-    bus.cmd.valid := cmd.valid
-    bus.cmd.last := cmd.last
-    if(!p.withWriteResponse) bus.cmd.context(0) := cmd.wr
-    bus.cmd.opcode := (cmd.wr ? B(Bmb.Cmd.Opcode.WRITE) | B(Bmb.Cmd.Opcode.READ))
-    bus.cmd.address := cmd.address.resized
-    bus.cmd.data := cmd.data
-    bus.cmd.length := (cmd.length << 2) | 3 //TODO better sub word access
-    bus.cmd.mask := cmd.mask
-    if(p.withExclusive) bus.cmd.exclusive := cmd.exclusive
+    case class Context() extends Bundle{
+      val isWrite = !p.withWriteResponse generate Bool()
+      val rspCount = (p.cpuDataWidth != p.memDataWidth) generate UInt(log2Up(aggregationMax) bits)
+    }
+
+    val withoutWriteBuffer = if(p.cpuDataWidth == p.memDataWidth) new Area {
+      val busCmdContext = Context()
+
+      bus.cmd.valid := cmd.valid
+      bus.cmd.last := cmd.last
+      bus.cmd.opcode := (cmd.wr ? B(Bmb.Cmd.Opcode.WRITE) | B(Bmb.Cmd.Opcode.READ))
+      bus.cmd.address := cmd.address.resized
+      bus.cmd.data := cmd.data
+      bus.cmd.length := (cmd.length << 2) | 3
+      bus.cmd.mask := cmd.mask
+      if (p.withExclusive) bus.cmd.exclusive := cmd.exclusive
+      if (!p.withWriteResponse) busCmdContext.isWrite := cmd.wr
+      bus.cmd.context := B(busCmdContext)
+
+      cmd.ready := bus.cmd.ready
+      if(p.withInvalidate) sync.arbitrationFrom(bus.sync)
+    }
+
+    val withWriteBuffer = if(p.cpuDataWidth != p.memDataWidth) new Area {
+      val buffer = new Area {
+        val stream = cmd.toEvent().m2sPipe()
+        val address = Reg(UInt(p.addressWidth bits))
+        val length = Reg(UInt(pipelinedMemoryBusConfig.lengthWidth bits))
+        val write = Reg(Bool)
+        val exclusive = Reg(Bool)
+        val data = Reg(Bits(p.memDataWidth bits))
+        val mask = Reg(Bits(p.memDataWidth/8 bits)) init(0)
+      }
+
+      val aggregationRange = log2Up(p.memDataWidth/8)-1 downto log2Up(p.cpuDataWidth/8)
+      val tagRange = p.addressWidth-1 downto aggregationRange.high+1
+      val aggregationEnabled = Reg(Bool)
+      val aggregationCounter = Reg(UInt(log2Up(aggregationMax) bits)) init(0)
+      val aggregationCounterFull = aggregationCounter === aggregationCounter.maxValue
+      val timer = Reg(UInt(log2Up(timeoutCycles)+1 bits)) init(0)
+      val timerFull = timer.msb
+      val hit = cmd.address(tagRange) === buffer.address(tagRange)
+      val canAggregate = cmd.valid && cmd.wr && !cmd.uncached && !cmd.exclusive && !timerFull && !aggregationCounterFull && (!buffer.stream.valid || aggregationEnabled && hit)
+      val doFlush = cmd.valid && !canAggregate || timerFull || aggregationCounterFull || !aggregationEnabled
+//      val canAggregate = False
+//      val doFlush = True
+      val busCmdContext = Context()
+      val halt = False
+
+      when(cmd.fire){
+        aggregationCounter := aggregationCounter + 1
+      }
+      when(buffer.stream.valid && !timerFull){
+        timer := timer + 1
+      }
+      when(bus.cmd.fire || !buffer.stream.valid){
+        buffer.mask := 0
+        aggregationCounter := 0
+        timer := 0
+      }
+
+      buffer.stream.ready := (bus.cmd.ready && doFlush || canAggregate) && !halt
+      bus.cmd.valid := buffer.stream.valid && doFlush && !halt
+      bus.cmd.last := True
+      bus.cmd.opcode := (buffer.write ? B(Bmb.Cmd.Opcode.WRITE) | B(Bmb.Cmd.Opcode.READ))
+      bus.cmd.address := buffer.address
+      bus.cmd.length := buffer.length
+      bus.cmd.data := buffer.data
+      bus.cmd.mask := buffer.mask
+
+      if (p.withExclusive) bus.cmd.exclusive := buffer.exclusive
+      bus.cmd.context.removeAssignments() := B(busCmdContext)
+      if (!p.withWriteResponse) busCmdContext.isWrite := bus.cmd.isWrite
+      busCmdContext.rspCount := aggregationCounter
+
+      val aggregationSel = cmd.address(aggregationRange)
+      when(cmd.fire){
+        val dIn = cmd.data.subdivideIn(8 bits)
+        val dReg = buffer.data.subdivideIn(8 bits)
+        for(byteId <- 0 until p.memDataBytes){
+          when(aggregationSel === byteId / p.cpuDataBytes && cmd.mask(byteId % p.cpuDataBytes)){
+            dReg.write(byteId, dIn(byteId % p.cpuDataBytes))
+            buffer.mask(byteId) := True
+          }
+        }
+      }
+
+      when(cmd.fire){
+        buffer.write := cmd.wr
+        buffer.address := cmd.address.resized
+        buffer.length := (cmd.length << 2) | 3
+        if (p.withExclusive) buffer.exclusive := cmd.exclusive
+
+        when(cmd.wr && !cmd.uncached && !cmd.exclusive){
+          aggregationEnabled := True
+          buffer.address(aggregationRange.high downto 0) := 0
+          buffer.length := p.memDataBytes-1
+        } otherwise {
+          aggregationEnabled := False
+        }
+      }
+
+
+      val rspCtx = bus.rsp.context.as(Context())
+      rsp.aggregated := rspCtx.rspCount
+
+      val syncLogic = p.withInvalidate generate new Area{
+        val cmdCtx = Stream(UInt(log2Up(aggregationMax) bits))
+        cmdCtx.valid := bus.cmd.fire && bus.cmd.isWrite
+        cmdCtx.payload := aggregationCounter
+        halt setWhen(!cmdCtx.ready)
+
+        val syncCtx = cmdCtx.queueLowLatency(syncPendingMax, latency = 1)
+        syncCtx.ready := bus.sync.fire
+
+        sync.arbitrationFrom(bus.sync)
+        sync.aggregated := syncCtx.payload
+      }
+    }
 
-    cmd.ready := bus.cmd.ready
     rsp.valid := bus.rsp.valid
     if(!p.withWriteResponse) rsp.valid clearWhen(bus.rsp.context(0))
@@ -399,21 +515,9 @@ case class DataCacheMemBus(p : DataCacheConfig) extends Bundle with IMasterSlave
       inv.enable := bus.inv.all
 
       bus.ack.arbitrationFrom(ack)
-
-      sync.arbitrationFrom(bus.sync)
-
-//      bus.ack.arbitrationFrom(ack)
-//      //TODO manage lenght ?
-//      inv.address := bus.inv.address
-////      inv.opcode := bus.inv.opcode
-//      ???
-//
-//      bus.ack.arbitrationFrom(ack)
+      // //TODO manage lenght ?
     }
-
-
-    bus
-  }
+  }.bus
 }
@@ -537,7 +641,7 @@ class DataCache(val p : DataCacheConfig, mmuParameter : MemoryTranslatorBusParam
   val memCmdSent = RegInit(False) setWhen (io.mem.cmd.ready) clearWhen (!io.cpu.writeBack.isStuck)
 
   val pending = withExclusive generate new Area{
     val counter = Reg(UInt(log2Up(pendingMax) + 1 bits)) init(0)
-    val counterNext = counter + U(io.mem.cmd.fire && io.mem.cmd.last) - U(io.mem.rsp.valid && io.mem.rsp.last)
+    val counterNext = counter + U(io.mem.cmd.fire && io.mem.cmd.last) - ((io.mem.rsp.valid && io.mem.rsp.last) ? (io.mem.rsp.aggregated +^ 1) | 0)
     counter := counterNext
 
     val done = RegNext(counterNext === 0)
@@ -554,7 +658,7 @@ class DataCache(val p : DataCacheConfig, mmuParameter : MemoryTranslatorBusParam
 
   val sync = withInvalidate generate new Area{
     io.mem.sync.ready := True
-
+    val syncCount = io.mem.sync.aggregated +^ 1
     val syncContext = new Area{
       val history = Mem(Bool, pendingMax)
       val wPtr, rPtr = Reg(UInt(log2Up(pendingMax)+1 bits)) init(0)
@@ -564,7 +668,7 @@ class DataCache(val p : DataCacheConfig, mmuParameter : MemoryTranslatorBusParam
       }
 
      when(io.mem.sync.fire){
-        rPtr := rPtr + 1
+        rPtr := rPtr + syncCount
      }
      val uncached = history.readAsync(rPtr.resized)
      val full = RegNext(wPtr - rPtr >= pendingMax-1)
@@ -573,7 +677,7 @@ class DataCache(val p : DataCacheConfig, mmuParameter : MemoryTranslatorBusParam
 
    def pending(inc : Bool, dec : Bool) = new Area {
      val pendingSync = Reg(UInt(log2Up(pendingMax) + 1 bits)) init(0)
-      val pendingSyncNext = pendingSync + U(io.mem.cmd.fire && io.mem.cmd.wr && inc) - U(io.mem.sync.fire && dec)
+      val pendingSyncNext = pendingSync + U(io.mem.cmd.fire && io.mem.cmd.wr && inc) - ((io.mem.sync.fire && dec) ? syncCount | 0)
      pendingSync := pendingSyncNext
    }
 
@@ -582,7 +686,7 @@ class DataCache(val p : DataCacheConfig, mmuParameter : MemoryTranslatorBusParam
 
    def track(load : Bool, uncached : Boolean) = new Area {
      val counter = Reg(UInt(log2Up(pendingMax) + 1 bits)) init(0)
-      counter := counter - U(io.mem.sync.fire && counter =/= 0 && (if(uncached) syncContext.uncached else !syncContext.uncached))
+      counter := counter - ((io.mem.sync.fire && counter =/= 0 && (if(uncached) syncContext.uncached else !syncContext.uncached)) ? syncCount | 0)
      when(load){ counter := (if(uncached) writeUncached.pendingSyncNext else writeCached.pendingSyncNext) }
 
      val busy = counter =/= 0
diff --git a/src/main/scala/vexriscv/plugin/CsrPlugin.scala b/src/main/scala/vexriscv/plugin/CsrPlugin.scala
index 43dbfaf..23f3323 100644
--- a/src/main/scala/vexriscv/plugin/CsrPlugin.scala
+++ b/src/main/scala/vexriscv/plugin/CsrPlugin.scala
@@ -39,7 +39,7 @@ case class CsrPluginConfig(
                             marchid : BigInt,
                             mimpid : BigInt,
                             mhartid : BigInt,
-                            misaExtensionsInit : Int,
+                            misaExtensionsInit : Int,
                             misaAccess : CsrAccess,
                             mtvecAccess : CsrAccess,
                             mtvecInit : BigInt,
@@ -68,6 +68,8 @@ case class CsrPluginConfig(
                             satpAccess : CsrAccess = CsrAccess.NONE,
                             medelegAccess : CsrAccess = CsrAccess.NONE,
                             midelegAccess : CsrAccess = CsrAccess.NONE,
+                            withExternalMhartid : Boolean = false,
+                            mhartidWidth : Int = 0,
                             pipelineCsrRead : Boolean = false,
                             pipelinedInterrupt : Boolean = true,
                             csrOhDecoder : Boolean = true,
@@ -85,12 +87,12 @@ object CsrPluginConfig{
   def small : CsrPluginConfig = small(0x00000020l)
   def smallest : CsrPluginConfig = smallest(0x00000020l)
 
-  def openSbi(hartId : Int, misa : Int) = CsrPluginConfig(
+  def openSbi(misa : Int) = CsrPluginConfig(
     catchIllegalAccess = true,
     mvendorid = 0,
     marchid = 0,
     mimpid = 0,
-    mhartid = hartId,
+    mhartid = 0,
     misaExtensionsInit = misa,
     misaAccess = CsrAccess.READ_ONLY,
     mtvecAccess = CsrAccess.READ_WRITE, //Could have been WRITE_ONLY :(
@@ -387,6 +389,7 @@ class CsrPlugin(val config: CsrPluginConfig) extends Plugin[VexRiscv] with Excep
   var contextSwitching : Bool = null
   var thirdPartyWake : Bool = null
   var inWfi : Bool = null
+  var externalMhartId : UInt = null
 
   override def askWake(): Unit = thirdPartyWake := True
@@ -515,6 +518,8 @@ class CsrPlugin(val config: CsrPluginConfig) extends Plugin[VexRiscv] with Excep
 
     pipeline.update(MPP, UInt(2 bits))
 
+
+    if(withExternalMhartid) externalMhartId = in UInt(mhartidWidth bits)
   }
 
   def inhibateInterrupts() : Unit = allowInterrupts := False
@@ -600,7 +605,8 @@ class CsrPlugin(val config: CsrPluginConfig) extends Plugin[VexRiscv] with Excep
       if(mvendorid != null) READ_ONLY(CSR.MVENDORID, U(mvendorid))
       if(marchid   != null) READ_ONLY(CSR.MARCHID  , U(marchid  ))
       if(mimpid    != null) READ_ONLY(CSR.MIMPID   , U(mimpid   ))
-      if(mhartid   != null) READ_ONLY(CSR.MHARTID  , U(mhartid  ))
+      if(mhartid   != null && !withExternalMhartid) READ_ONLY(CSR.MHARTID  , U(mhartid  ))
+      if(withExternalMhartid) READ_ONLY(CSR.MHARTID  , externalMhartId)
       misaAccess(CSR.MISA, xlen-2 -> misa.base , 0 -> misa.extensions)
 
       //Machine CSR
diff --git a/src/main/scala/vexriscv/plugin/DBusCachedPlugin.scala b/src/main/scala/vexriscv/plugin/DBusCachedPlugin.scala
index f133616..0b580d8 100644
--- a/src/main/scala/vexriscv/plugin/DBusCachedPlugin.scala
+++ b/src/main/scala/vexriscv/plugin/DBusCachedPlugin.scala
@@ -195,6 +195,7 @@ class DBusCachedPlugin(val config : DataCacheConfig,
         rsp.exclusive := RegNext(dBus.rsp.exclusive)
         rsp.error := RegNext(dBus.rsp.error)
         rsp.last := RegNext(dBus.rsp.last)
+        rsp.aggregated := RegNext(dBus.rsp.aggregated)
         rsp.data := RegNextWhen(dBus.rsp.data, dBus.rsp.valid && !cache.io.cpu.writeBack.keepMemRspData)
         rsp
       }