Improvements in picorv32_pcpi_mul

This commit is contained in:
Clifford Wolf 2015-06-28 13:07:50 +02:00
parent 923ac360ff
commit 1f99de5117
4 changed files with 53 additions and 21 deletions

View File

@ -194,6 +194,9 @@ CPI numbers for a core built without ENABLE_REGS_DUALPORT.
| indirect jump (jalr) | 6 | 6 | | indirect jump (jalr) | 6 | 6 |
| shift operations | 4-14 | 4-15 | | shift operations | 4-14 | 4-15 |
When `ENABLE_MUL` is activated, a `MUL` instruction executes in 42 cycles
and a `MULH[SU|U]` instruction executes in 74 cycles.
Dhrystone benchmark results: 0.309 DMIPS/MHz (544 Dhrystones/Second/MHz) Dhrystone benchmark results: 0.309 DMIPS/MHz (544 Dhrystones/Second/MHz)
For the Dhrystone benchmark the average CPI is 4.167. For the Dhrystone benchmark the average CPI is 4.167.

View File

@ -292,6 +292,10 @@ start:
/* break */ /* break */
sbreak sbreak
/* Hard mul functions for multest.c
**********************************/
hard_mul: hard_mul:
mul a0, a0, a1 mul a0, a0, a1
ret ret

View File

@ -660,12 +660,6 @@ module picorv32 #(
reg_pc <= current_pc; reg_pc <= current_pc;
reg_next_pc <= current_pc; reg_next_pc <= current_pc;
if (WITH_PCPI) begin
pcpi_insn_valid <= 0;
pcpi_rs1_valid <= 0;
pcpi_rs2_valid <= 0;
end
latched_store <= 0; latched_store <= 0;
latched_stalu <= 0; latched_stalu <= 0;
latched_branch <= 0; latched_branch <= 0;
@ -726,6 +720,10 @@ module picorv32 #(
reg_sh <= decoded_rs2 ? cpuregs[decoded_rs2] : 0; reg_sh <= decoded_rs2 ? cpuregs[decoded_rs2] : 0;
reg_op2 <= decoded_rs2 ? cpuregs[decoded_rs2] : 0; reg_op2 <= decoded_rs2 ? cpuregs[decoded_rs2] : 0;
if (pcpi_int_ready) begin if (pcpi_int_ready) begin
mem_do_rinst <= 1;
pcpi_insn_valid <= 0;
pcpi_rs1_valid <= 0;
pcpi_rs2_valid <= 0;
reg_out <= pcpi_int_rd; reg_out <= pcpi_int_rd;
latched_store <= pcpi_int_rd_valid; latched_store <= pcpi_int_rd_valid;
cpu_state <= cpu_state_fetch; cpu_state <= cpu_state_fetch;
@ -848,6 +846,10 @@ module picorv32 #(
if (WITH_PCPI && pcpi_insn_valid) begin if (WITH_PCPI && pcpi_insn_valid) begin
pcpi_rs2_valid <= 1; pcpi_rs2_valid <= 1;
if (pcpi_int_ready) begin if (pcpi_int_ready) begin
mem_do_rinst <= 1;
pcpi_insn_valid <= 0;
pcpi_rs1_valid <= 0;
pcpi_rs2_valid <= 0;
reg_out <= pcpi_int_rd; reg_out <= pcpi_int_rd;
latched_store <= pcpi_int_rd_valid; latched_store <= pcpi_int_rd_valid;
cpu_state <= cpu_state_fetch; cpu_state <= cpu_state_fetch;
@ -1023,7 +1025,10 @@ endmodule
* picorv32_pcpi_mul * picorv32_pcpi_mul
***************************************************************/ ***************************************************************/
module picorv32_pcpi_mul ( module picorv32_pcpi_mul #(
// increasing this parameter increases performance and core size
parameter STEPS_AT_ONCE = 1
) (
input clk, resetn, input clk, resetn,
input pcpi_insn_valid, input pcpi_insn_valid,
@ -1067,9 +1072,32 @@ module picorv32_pcpi_mul (
end end
reg [63:0] rs1, rs2, rd, rdx; reg [63:0] rs1, rs2, rd, rdx;
reg [63:0] next_rs1, next_rs2, next_rd, next_rdx, next_rdt;
reg [6:0] mul_counter; reg [6:0] mul_counter;
reg mul_waiting; reg mul_waiting;
reg mul_finish; reg mul_finish;
integer i;
// carry save accumulator
//
// Combinational logic that computes the next values of the multiplier
// state (next_rd, next_rdx, next_rs1, next_rs2) from the current register
// values (rd, rdx, rs1, rs2). The loop body is applied STEPS_AT_ONCE times,
// so each evaluation performs STEPS_AT_ONCE shift-and-add steps.
//
// rd holds the bitwise sum and rdx holds the carries (already shifted left
// by one); the two are kept separate here and never added together in
// this block.
//
// NOTE(review): next_rdt is only assigned inside the loop, so this block
// appears to assume STEPS_AT_ONCE >= 1 -- confirm that a value of 0 is
// never used.
always @* begin
// start from the currently registered state
next_rd = rd;
next_rdx = rdx;
next_rs1 = rs1;
next_rs2 = rs2;
for (i = 0; i < STEPS_AT_ONCE; i=i+1) begin
if (next_rs1[0]) begin
// multiplier LSB is set: combine sum, carry and next_rs2.
// sum bits = 3-input XOR, carry bits = bitwise majority shifted left by 1
next_rdt = (next_rd ^ next_rdx) ^ next_rs2;
next_rdx = ((next_rd & next_rdx) | (next_rd & next_rs2) | (next_rdx & next_rs2)) << 1;
end else begin
// multiplier LSB is clear: only combine the existing sum and carry
next_rdt = next_rd ^ next_rdx;
next_rdx = (next_rd & next_rdx) << 1;
end
// next_rdt is a temporary so that next_rdx above is computed from the
// value next_rd had before this step
next_rd = next_rdt;
// advance to the next bit: shift next_rs1 right, shift next_rs2 left
next_rs1 = next_rs1 >> 1;
next_rs2 = next_rs2 << 1;
end
end
always @(posedge clk) begin always @(posedge clk) begin
mul_finish <= 0; mul_finish <= 0;
@ -1089,21 +1117,16 @@ module picorv32_pcpi_mul (
rd <= 0; rd <= 0;
rdx <= 0; rdx <= 0;
mul_counter <= instr_any_mulh ? 64 : 32; mul_counter <= (instr_any_mulh ? 63 - STEPS_AT_ONCE : 31 - STEPS_AT_ONCE);
mul_waiting <= !mul_start; mul_waiting <= !mul_start;
end else begin end else begin
// carry save accumulator rd <= next_rd;
if (rs1[0]) begin rdx <= next_rdx;
rd <= rd ^ rdx ^ rs2; rs1 <= next_rs1;
rdx <= ((rd & rdx) | (rd & rs2) | (rdx & rs2)) << 1; rs2 <= next_rs2;
end else begin
rd <= rd ^ rdx; mul_counter <= mul_counter - STEPS_AT_ONCE;
rdx <= (rd & rdx) << 1; if (mul_counter[6]) begin
end
rs1 <= rs1 >> 1;
rs2 <= rs2 << 1;
mul_counter <= mul_counter - 1;
if (!mul_counter) begin
mul_finish <= 1; mul_finish <= 1;
mul_waiting <= 1; mul_waiting <= 1;
end end

View File

@ -244,10 +244,12 @@ module testbench;
$finish; $finish;
end end
integer cycle_counter;
always @(posedge clk) begin always @(posedge clk) begin
cycle_counter <= resetn ? cycle_counter + 1 : 0;
if (resetn && trap) begin if (resetn && trap) begin
repeat (10) @(posedge clk); repeat (10) @(posedge clk);
$display("TRAP"); $display("TRAP after %1d clock cycles", cycle_counter);
$finish; $finish;
end end
end end