diff --git a/README.md b/README.md
index c885c4d..a915f4a 100644
--- a/README.md
+++ b/README.md
@@ -27,14 +27,14 @@ PicoRV32 is free and open hardware licensed under the [ISC license](http://en.wi
 Features and Typical Applications
 ---------------------------------
 
-- Small (~1000 LUTs in a 7-Series Xilinx FPGA)
-- High fMAX (~250 MHz on 7-Series Xilinx FPGAs)
+- Small (750-1700 LUTs in 7-Series Xilinx Architecture)
+- High f<sub>max</sub> (250-450 MHz on 7-Series Xilinx FPGAs)
 - Selectable native memory interface or AXI4-Lite master
 - Optional IRQ support (using a simple custom ISA)
 - Optional Co-Processor Interface
 
 This CPU is meant to be used as auxiliary processor in FPGA designs and ASICs. Due
-to its high fMAX it can be integrated in most existing designs without crossing
+to its high f<sub>max</sub> it can be integrated in most existing designs without crossing
 clock domains. When operated on a lower frequency, it will have a lot of timing
 slack and thus can be added to a design without compromising timing closure.
 
@@ -251,16 +251,16 @@ The start address of the interrupt handler.
 Cycles per Instruction Performance
 ----------------------------------
 
-*A short reminder: This core is optimized for size, not performance.*
+*A short reminder: This core is optimized for size and f<sub>max</sub>, not performance.*
 
 Unless stated otherwise, the following numbers apply to a PicoRV32 with
 ENABLE_REGS_DUALPORT active and connected to a memory that can accommodate
 requests within one clock cycle.
 
-The average Cycles per Instruction (CPI) is 4 to 5, depending on the mix of
-instructions in the code. The CPI numbers for the individual instructions
-can be found in the table below. The column "CPI (SP)" contains the
-CPI numbers for a core built without ENABLE_REGS_DUALPORT.
+The average Cycles per Instruction (CPI) is approximately 4, depending on the mix of
+instructions in the code. The CPI numbers for the individual instructions can
+be found in the table below. The column "CPI (SP)" contains the CPI numbers for
+a core built without ENABLE_REGS_DUALPORT.
 
 | Instruction          |  CPI | CPI (SP) |
 | ---------------------| ----:| --------:|
@@ -277,9 +277,9 @@ CPI numbers for a core built without ENABLE_REGS_DUALPORT.
 When `ENABLE_MUL` is activated, then a `MUL` instruction will execute
 in 40 cycles and a `MULH[SU|U]` instruction will execute in 72 cycles.
 
-Dhrystone benchmark results: 0.311 DMIPS/MHz (547 Dhrystones/Second/MHz)
+Dhrystone benchmark results: 0.327 DMIPS/MHz (575 Dhrystones/Second/MHz)
 
-For the Dhrystone benchmark the average CPI is 4.144.
+For the Dhrystone benchmark the average CPI is 3.945.
 
 
 PicoRV32 Native Memory Interface
@@ -531,7 +531,7 @@ pure RV32I target, and install it in `/opt/riscv32i`:
 
     git clone https://github.com/riscv/riscv-gnu-toolchain riscv-gnu-toolchain-rv32i
     cd riscv-gnu-toolchain-rv32i
-    git checkout 4bcd4f5
+    git checkout 06c957a
 
     mkdir build; cd build
     ../configure --with-xlen=32 --with-arch=I --prefix=/opt/riscv32i
@@ -541,7 +541,7 @@ The commands will all be named using the prefix `riscv32-unknown-elf-`, which
 makes it easy to install them side-by-side with the regular riscv-tools, which
 are using the name prefix `riscv64-unknown-elf-` by default.
 
-*Note: This instructions are for git rev 4bcd4f5 (2015-12-14) of riscv-gnu-toolchain.*
+*Note: These instructions are for git rev 06c957a (2016-01-20) of riscv-gnu-toolchain.*
 
 
 Evaluation: Timing and Utilization on Xilinx 7-Series FPGAs
diff --git a/firmware/start.S b/firmware/start.S
index 34058aa..a5547b8 100644
--- a/firmware/start.S
+++ b/firmware/start.S
@@ -15,6 +15,11 @@
 #  undef ENABLE_RVTST
 #endif
 
+// Only save registers in IRQ wrapper that are to be saved by the caller in
+// the RISC-V ABI, with the exception of the stack pointer. The IRQ handler
+// will save the rest if necessary. I.e. skip x3, x4, x8, x9, and x18-x27.
+#undef ENABLE_FASTIRQ
+
 #include "custom_ops.S"
 
 	.section .text
@@ -58,6 +63,23 @@ irq_vec:
 	getq x2, q3
 	sw x2,   2*4(x1)
 
+#ifdef ENABLE_FASTIRQ
+	sw x5,   5*4(x1)
+	sw x6,   6*4(x1)
+	sw x7,   7*4(x1)
+	sw x10, 10*4(x1)
+	sw x11, 11*4(x1)
+	sw x12, 12*4(x1)
+	sw x13, 13*4(x1)
+	sw x14, 14*4(x1)
+	sw x15, 15*4(x1)
+	sw x16, 16*4(x1)
+	sw x17, 17*4(x1)
+	sw x28, 28*4(x1)
+	sw x29, 29*4(x1)
+	sw x30, 30*4(x1)
+	sw x31, 31*4(x1)
+#else
 	sw x3,   3*4(x1)
 	sw x4,   4*4(x1)
 	sw x5,   5*4(x1)
@@ -87,9 +109,30 @@ irq_vec:
 	sw x29, 29*4(x1)
 	sw x30, 30*4(x1)
 	sw x31, 31*4(x1)
+#endif
 
 #else // ENABLE_QREGS
 
+#ifdef ENABLE_FASTIRQ
+	sw gp,   0*4+0x200(zero)
+	sw x1,   1*4+0x200(zero)
+	sw x2,   2*4+0x200(zero)
+	sw x5,   5*4+0x200(zero)
+	sw x6,   6*4+0x200(zero)
+	sw x7,   7*4+0x200(zero)
+	sw x10, 10*4+0x200(zero)
+	sw x11, 11*4+0x200(zero)
+	sw x12, 12*4+0x200(zero)
+	sw x13, 13*4+0x200(zero)
+	sw x14, 14*4+0x200(zero)
+	sw x15, 15*4+0x200(zero)
+	sw x16, 16*4+0x200(zero)
+	sw x17, 17*4+0x200(zero)
+	sw x28, 28*4+0x200(zero)
+	sw x29, 29*4+0x200(zero)
+	sw x30, 30*4+0x200(zero)
+	sw x31, 31*4+0x200(zero)
+#else
 	sw gp,   0*4+0x200(zero)
 	sw x1,   1*4+0x200(zero)
 	sw x2,   2*4+0x200(zero)
@@ -122,6 +165,7 @@ irq_vec:
 	sw x29, 29*4+0x200(zero)
 	sw x30, 30*4+0x200(zero)
 	sw x31, 31*4+0x200(zero)
+#endif
 
 #endif // ENABLE_QREGS
 
@@ -160,6 +204,23 @@ irq_vec:
 	lw x2,   2*4(x1)
 	setq q2, x2
 
+#ifdef ENABLE_FASTIRQ
+	lw x5,   5*4(x1)
+	lw x6,   6*4(x1)
+	lw x7,   7*4(x1)
+	lw x10, 10*4(x1)
+	lw x11, 11*4(x1)
+	lw x12, 12*4(x1)
+	lw x13, 13*4(x1)
+	lw x14, 14*4(x1)
+	lw x15, 15*4(x1)
+	lw x16, 16*4(x1)
+	lw x17, 17*4(x1)
+	lw x28, 28*4(x1)
+	lw x29, 29*4(x1)
+	lw x30, 30*4(x1)
+	lw x31, 31*4(x1)
+#else
 	lw x3,   3*4(x1)
 	lw x4,   4*4(x1)
 	lw x5,   5*4(x1)
@@ -189,6 +250,7 @@ irq_vec:
 	lw x29, 29*4(x1)
 	lw x30, 30*4(x1)
 	lw x31, 31*4(x1)
+#endif
 
 	getq x1, q1
 	getq x2, q2
@@ -201,6 +263,26 @@ irq_vec:
 	sbreak
 1:
 
+#ifdef ENABLE_FASTIRQ
+	lw gp,   0*4+0x200(zero)
+	lw x1,   1*4+0x200(zero)
+	lw x2,   2*4+0x200(zero)
+	lw x5,   5*4+0x200(zero)
+	lw x6,   6*4+0x200(zero)
+	lw x7,   7*4+0x200(zero)
+	lw x10, 10*4+0x200(zero)
+	lw x11, 11*4+0x200(zero)
+	lw x12, 12*4+0x200(zero)
+	lw x13, 13*4+0x200(zero)
+	lw x14, 14*4+0x200(zero)
+	lw x15, 15*4+0x200(zero)
+	lw x16, 16*4+0x200(zero)
+	lw x17, 17*4+0x200(zero)
+	lw x28, 28*4+0x200(zero)
+	lw x29, 29*4+0x200(zero)
+	lw x30, 30*4+0x200(zero)
+	lw x31, 31*4+0x200(zero)
+#else
 	lw gp,   0*4+0x200(zero)
 	lw x1,   1*4+0x200(zero)
 	lw x2,   2*4+0x200(zero)
@@ -233,6 +315,7 @@ irq_vec:
 	lw x29, 29*4+0x200(zero)
 	lw x30, 30*4+0x200(zero)
 	lw x31, 31*4+0x200(zero)
+#endif
 
 #endif // ENABLE_QREGS
 
diff --git a/picorv32.v b/picorv32.v
index 5e9d539..8a6dbbb 100644
--- a/picorv32.v
+++ b/picorv32.v
@@ -353,7 +353,7 @@ module picorv32 #(
 			0: begin
 				mem_addr <= mem_la_addr;
 				mem_wdata <= mem_la_wdata;
-				mem_wstrb <= mem_la_wstrb;
+				mem_wstrb <= mem_la_wstrb & {4{mem_la_write}};
 				if (mem_do_prefetch || mem_do_rinst) begin
 					current_insn_addr <= next_pc;
 				end
@@ -945,6 +945,7 @@ module picorv32 #(
 			latched_is_lh <= 0;
 			latched_is_lb <= 0;
 			pcpi_valid <= 0;
+			pcpi_timeout <= 0;
 			irq_active <= 0;
 			irq_mask <= ~0;
 			next_irq_pending = 0;