Rewrite control_loop_math and simulate

Replace specialized math nodes with single multiplier: each constant must be resized to fit in the multiplier. Simplifies design at the cost of speed.
2022-11-13 18:03:55 -05:00 · 2022-11-13 18:03:55 -05:00 · 50ea679e02
parent 88c42a9f4a
commit 50ea679e02
11 changed files with 364 additions and 480 deletions
--- a/firmware/rtl/control_loop/Makefile
+++ b/firmware/rtl/control_loop/Makefile
@ -1,32 +1,26 @@
 # Makefile for tests and hardware verification.
 .PHONY: test clean
 COMMON_CPP = control_loop_math_implementation.cpp
 COMMON= ${COMMON_CPP} control_loop_math_implementation.h
-obj_dir/Vmul_const.mk: mul_const_sim.cpp mul_const.v boothmul.v intsat.v
+CONSTS_FRAC=43
 E_WID=19
 test: obj_dir/Vcontrol_loop_math
 	obj_dir/Vcontrol_loop_math
 clean:
 	rm -rf obj_dir
 obj_dir/Vcontrol_loop_math.mk: control_loop_math_sim.cpp ${COMMON} \
                               control_loop_math.v
 	verilator --cc --exe -Wall --trace --trace-fst \
-		--top-module mul_const \
+		--top-module control_loop_math \
-		mul_const.v mul_const_sim.cpp
+		-GCONSTS_FRAC=${CONSTS_FRAC} -DDEBUG_CONTROL_LOOP_MATH \
 		-CFLAGS -DCONSTS_FRAC=${CONSTS_FRAC} \
 		-CFLAGS -DE_WID=${E_WID} \
 		control_loop_math.v control_loop_math_sim.cpp ${COMMON_CPP}
-obj_dir/Vmul_const: obj_dir/Vmul_const.mk
+obj_dir/Vcontrol_loop_math: obj_dir/Vcontrol_loop_math.mk
-	cd obj_dir && make -f Vmul_const.mk
+	cd obj_dir && make -f Vcontrol_loop_math.mk
 SEC_PER_CYCLE_WID=15
 CYCLE_COUNT_WID=18
 UNSAT_WID=(${SEC_PER_CYCLE_WID} + ${CYCLE_COUNT_WID})
 MAX_WID=48
 DT_WID=$(shell echo $$((${UNSAT_WID} > ${MAX_WID} ? ${MAX_WID} : ${UNSAT_WID})))
 obj_dir/Vcalculate_dt.mk: calculate_dt_sim.cpp calculate_dt.v ${COMMON}
 	verilator --cc --exe -Wall --trace --trace-fst \
 		--top-module calculate_dt \
 		-GSEC_PER_CYCLE_WID=${SEC_PER_CYCLE_WID} \
 		-GCYCLE_COUNT_WID=${CYCLE_COUNT_WID} \
 		-CFLAGS -DDT_WID=${DT_WID} \
 		calculate_dt.v calculate_dt_sim.cpp ${COMMON_CPP}
 obj_dir/Vcalculate_dt: obj_dir/Vcalculate_dt.mk
 	cd obj_dir && make -f Vcalculate_dt.mk
 test: obj_dir/Vcalculate_dt
 	obj_dir/Vcalculate_dt
--- a/firmware/rtl/control_loop/boothmul.v
+++ b/firmware/rtl/control_loop/boothmul.v
@ -142,3 +142,4 @@ end
 `endif
 endmodule
 `undefineall
--- a/firmware/rtl/control_loop/calculate_dt.v
+++ b/firmware/rtl/control_loop/calculate_dt.v
@ -1,63 +0,0 @@
 /* Calculate and truncate Δt = cycles/100MHz.
 * NOTE: boothmul is a SIGNED algorithm so both inputs are SIGNED.
 * This means that SEC_PER_CYCLE must have a leading 0
 * and that cycles must also have a leading zero.
 */
 `undefineall
 /* calculate_dt: feeds `cycles` and the SEC_PER_CYCLE parameter into a
  * boothmul instance and exposes the product on `dt`.
  *
  * Ports:
  *   clk, arm  - forwarded unchanged to the multiplier.
  *   finished  - driven directly by the multiplier's `fin` output.
  *   cycles    - CYCLE_COUNT_WID-bit multiplicand.
  *   dt        - product, at most MAX_WID bits wide.
  */
 module calculate_dt #(
 	/* This number is 1/(clock cycle).
 	   The number is interpreted so the least significant bit
 	   coincides with the LSB of a constant. */
 	parameter SEC_PER_CYCLE_WID = 15,
 	parameter [SEC_PER_CYCLE_WID-1:0] SEC_PER_CYCLE = 'b010101011110011,
 	parameter CYCLE_COUNT_WID = 18,
 	/* Upper bound on the width of the dt output. */
 	parameter MAX_WID = 48
 ) (
 	input clk,
 	input arm,
 	output finished,
 	input [CYCLE_COUNT_WID-1:0] cycles,
 /* Multiplication of Q18.0 and 14 lower significant bits. */
 /* DT_WID_UNTRUNC is the full product width; DT_WID caps it at MAX_WID. */
 `define DT_WID_UNTRUNC (SEC_PER_CYCLE_WID + CYCLE_COUNT_WID)
 `define DT_WID (`DT_WID_UNTRUNC > MAX_WID ? MAX_WID : `DT_WID_UNTRUNC)
 	output [`DT_WID-1:0] dt
 );
 /* Full-width product straight out of the multiplier. */
 wire [`DT_WID_UNTRUNC-1:0] dt_untrunc;
 boothmul #(
 	.A1_LEN(CYCLE_COUNT_WID),
 	.A2_LEN(SEC_PER_CYCLE_WID)
 ) mul (
 	.clk(clk),
 	.arm(arm),
 	.a1(cycles),
 	.a2(SEC_PER_CYCLE),
 	.outn(dt_untrunc),
 	.fin(finished)
 );
 /* When the product is wider than the output, hand it to intsat with LTRUNC
  * set to the number of excess bits; otherwise wire it through unchanged.
  * NOTE(review): intsat is defined elsewhere; presumably it saturates while
  * dropping the upper LTRUNC bits - confirm against intsat.v. */
 generate if (`DT_WID_UNTRUNC > `DT_WID) begin
 	intsat #(
 		.IN_LEN(`DT_WID_UNTRUNC),
 		.LTRUNC(`DT_WID_UNTRUNC - `DT_WID)
 	) sat (
 		.inp(dt_untrunc),
 		.outp(dt)
 	);
 end else begin
 	assign dt = dt_untrunc;
 end endgenerate
 /* Waveform dump, only when built under Verilator. */
 `ifdef VERILATOR
 initial begin
 	$dumpfile("calculate_dt.fst");
 	$dumpvars;
 end
 `endif
 endmodule
--- a/firmware/rtl/control_loop/calculate_dt_sim.cpp
+++ b/firmware/rtl/control_loop/calculate_dt_sim.cpp
@ -1,62 +0,0 @@
 #include <cstdio>
 #include "control_loop_math_implementation.h"
 #include <verilated.h>
 #include "Vcalculate_dt.h"
 using ModType = Vcalculate_dt;
 uint32_t main_time = 0;
 /* Reports the current simulation time, i.e. the number of clock edges
  * evaluated so far (main_time is bumped once per eval in run_clock). */
 double sc_time_stamp() {
 	const double now = main_time;
 	return now;
 }
 ModType *mod;
 /* Advance the model by one full clock period: two clk toggles, each
  * followed by an eval() and one tick of main_time. */
 static void run_clock() {
 	int edges_left = 2;
 	while (edges_left-- > 0) {
 		mod->clk = !mod->clk;
 		mod->eval();
 		main_time++;
 	}
 }
 /* One-time simulation setup: forwards the command line to Verilated,
  * enables tracing, allocates the model into the global `mod`, and starts
  * with clk low. Must run before any use of `mod`. */
 static void init(int argc, char **argv) {
 	Verilated::commandArgs(argc, argv);
 	Verilated::traceEverOn(true);
 	mod = new ModType;
 	mod->clk = 0;
 }
 /* Sweeps cycle counts 1 .. 2^17-2 through the simulated calculate_dt
  * module and compares every hardware result with the software model
  * calculate_dt(). Each matching value is printed as a fixed-point string.
  *
  * Returns 0 when all values match, 1 at the first mismatch (the sweep
  * stops there).
  */
 int main(int argc, char **argv) {
 	int r = 0;
 	init(argc, argv);
 	for (V i = 1; i < ((1 << 17) - 1); i++) {
 		mod->cycles = i;
 		mod->arm = 1;
 		/* Clock until the multiplier reports completion. */
 		do { run_clock(); } while (!mod->finished);
 		mod->arm = 0;
 		V real_dt = calculate_dt(i, DT_WID);
 		if (mod->dt != real_dt) {
 			/* %lld requires long long arguments; neither V (int64_t)
 			 * nor the port type is guaranteed to be long long, so
 			 * convert explicitly instead of relying on matching widths. */
 			printf("(%lld) %lld != %lld\n",
 				(long long)i, (long long)mod->dt, (long long)real_dt);
 			r = 1;
 			goto end;
 		}
 		struct fixed_point fxp = {
 			.val = real_dt,
 			.whole_len = 0,
 			.frac_len = 40
 		};
 		printf("%s\n", fxp_to_string(fxp).c_str());
 		/* One idle period with arm low before the next value. */
 		run_clock();
 	}
 end:
 	mod->final();
 	delete mod;
 	return r;
 }
--- a/firmware/rtl/control_loop/control_loop_math.v
+++ b/firmware/rtl/control_loop/control_loop_math.v
@ -1,4 +1,3 @@
 `undefineall
 /*************** Precision **************
 * The control loop is designed around these values, but generally
 * does not hardcode them.
@ -20,225 +19,216 @@
 * Δt is cycles/100MHz. This makes Δt at least 10 ns, with a
 * maximum of 1 ms.
 *
- * [1 : sign][7: whole][40: fractional]
+ * [1 : sign][20: whole][43: fractional]
 * This is 1048575 to -1048576, with a resolution of 1.137e-13.
 */
 module control_loop_math #(
-	parameter CONSTS_WHOLE = 8,
+	parameter CONSTS_WHOLE = 21,
-	parameter CONSTS_FRAC = 40,
+	parameter CONSTS_FRAC = 43,
 `define CONSTS_WID (CONSTS_WHOLE + CONSTS_FRAC)
 	parameter CONSTS_SIZ=7,
 	parameter DAC_DATA_WID = 20,
 	parameter ADC_WID = 18,
 	parameter [`CONSTS_WID-1:0] SEC_PER_CYCLE = 'b10101011110011000,
 	parameter CYCLE_COUNT_WID = 18
 `define E_WID (ADC_WID + 1)
 	/* How large the intermediate value should be. This should hold the ADC
 	 * value /as an integer/ along with the P, I values.
 	 * The OUT_FRAC value should usually but not always be the same as CONSTS_FRAC.
 	 */
 	parameter OUT_WHOLE = 20,
 	parameter OUT_FRAC = 40
 `define OUT_WID (OUT_WHOLE + OUT_FRAC)
 ) (
 	input clk,
 	input arm,
 	output reg finished,
-	input [ADC_WID-1:0] setpt,
+	input signed [ADC_WID-1:0] setpt,
-	input [ADC_WID-1:0] measured,
+	input signed [ADC_WID-1:0] measured,
-	input [`CONSTS_WID-1:0] cl_P,
+	input signed [`CONSTS_WID-1:0] cl_P,
-	input [`CONSTS_WID-1:0] cl_I,
+	input signed [`CONSTS_WID-1:0] cl_I,
-	input [CYCLE_COUNT_WID-1:0] cycles,
+	input signed [CYCLE_COUNT_WID-1:0] cycles,
-	input [`ERR_WID-1:0] e_prev,
+	input signed [`E_WID-1:0] e_prev,
-	input [`OUT_WID-1:0] adjval_prev,
+	input signed [`CONSTS_WID-1:0] adjval_prev,
-	output reg [`ERR_WID-1:0] e_cur,
+`ifdef DEBUG_CONTROL_LOOP_MATH
-	output [`OUT_WID-1:0] adj_val
+	output reg [`CONSTS_WID-1:0] dt_reg,
 	output reg [`CONSTS_WID-1:0] idt_reg,
 	output reg [`CONSTS_WID-1:0] epidt_reg,
 	output reg [`CONSTS_WID-1:0] ep_reg,
 `endif
 	output reg signed [`E_WID-1:0] e_cur,
 	output signed [`CONSTS_WID-1:0] adj_val
 );
-/* Calculate current error */
+/*******
-assign e_cur = setpt - measured;
+ * Multiplier segment.
-
+ * Multiplies two 64 bit numbers and right-saturate + truncates it
-/**** Stage 1: calculate Δt = cycles/100MHz
+ * to be a 64 bit output, according to fixed-point rules.
 *    cycles: CYCLE_COUNT_WID.0
 *    SEC_PER_CYCLE: 0....SEC_PER_CYCLE_WID
 * -x--------------------------------
 *    dt_unsat: CYCLE_COUNT_WID + SEC_PER_CYCLE_WID
 *
 * Optimization note: the total width can be capped to below 1.
 */
-reg arm_stage_1 = 0;
+reg signed [`CONSTS_WID-1:0] a1;
-
+reg signed [`CONSTS_WID-1:0] a2;
-`define DT_UNSAT_WID (CYCLE_COUNT_WID + SEC_PER_CYCLE_WID)
+/* verilator lint_off UNUSED */
-wire [`DT_UNSAT_WID-1:0] dt_unsat;
+wire signed [`CONSTS_WID+`CONSTS_WID-1:0] out_untrunc;
-wire mul_dt_fin;
+wire mul_fin;
 reg mul_arm;
 boothmul #(
-	.A1_LEN(CYCLE_COUNT_WID),
+	.A1_LEN(`CONSTS_WID),
-	.A2_LEN(SEC_PER_CYCLE_WID)
+	.A2_LEN(`CONSTS_WID),
-) mul_dt (
+	.A2LEN_SIZ(CONSTS_SIZ)
 ) multiplier (
 	.a1(a1),
 	.a2(a2),
 	.clk(clk),
-	.arm(arm_stage_1),
+	.outn(out_untrunc),
-	.a1(cycles),
+	.fin(mul_fin),
-	.a2(SEC_PER_CYCLE),
+	.arm(mul_arm)
 	.outn(dt_unsat),
 	.fin(mul_dt_fin)
 );
-`define DT_WID (`DT_UNSAT_WID > `CONSTS_WID ? `CONSTS_WID : `DT_UNSAT_WID)
+/****************************
-wire [`DT_WID-1:0] dt;
+ * QX.Y * QX.Y = Q(2X).(2Y)
-
+ * This right-truncation gets rid of the lowest Y bits.
-`define DT_WHOLE (`DT_WID < `CONSTS_FRAC ? 0 : `CONSTS_FRAC - `DT_WID)
+ * Q(2X).Y
 `define DT_FRAC (`DT_WID - `DT_WHOLE)
 generate if (`DT_UNSAT_WID > `CONSTS_WID) begin
 	intsat #(
 		.IN_LEN(`DT_UNSAT_WID),
 		.LTRUNC(`DT_UNSAT_WID - `CONSTS_WID)
 	) insat_dt (
 		.inp(dt_unsat),
 		.outp(dt)
 	);
 end else begin
 	assign dt = dt_unsat;
 end endgenerate
 /**** Stage 2: Calculate P + IΔt
 *     I: CONSTS_WHOLE.CONSTS_FRAC
 *  x  dt: DT_WHOLE.DT_FRAC
 *-- -------------------------------
 *     Idt_unscaled:
 *-- --------------------------------
 *     Idt: CONSTS_WHOLE.CONSTS_FRAC
 *
 * Right-truncate DT_FRAC bits to ensure CONSTS_FRAC
 * Integer-saturate the DT_WHOLE bits if it extends far enough
 */
-wire stage2_finished;
+`define OUT_RTRUNC_WID (`CONSTS_WID+`CONSTS_WID-CONSTS_FRAC)
-reg arm_stage2 = 0;
+wire signed [`OUT_RTRUNC_WID-1:0] out_rtrunc
-wire [`CONSTS_WID-1:0] idt;
+	= out_untrunc[`CONSTS_WID+`CONSTS_WID-1:CONSTS_FRAC];
-mul_const #(
+wire signed [`CONSTS_WID-1:0] mul_out;
 	.CONSTS_WHOLE(CONSTS_WHOLE),
 	.CONSTS_FRAC(CONSTS_FRAC),
 	.IN_WHOLE(`DT_WHOLE),
 	.IN_FRAC(`DT_FRAC),
 	.OUT_WHOLE(CONSTS_WHOLE),
 	.OUT_FRAC(CONSTS_FRAC)
 ) mul_const_idt (
 	.clk(clk),
 	.inp(dt),
 	.const_in(cl_I),
 	.arm(arm_stage2),
 	.outp(idt),
 	.finished(stage2_finished)
 );
-reg [`CONSTS_WID:0] pidt_untrunc;
+/***************************
-/* Assuming that the constraints on cl_P, I, and dt hold */
+ * Saturate higher X bits away.
-wire [`CONSTS_WID-1:0] pidt = pidt_untrunc[`CONSTS_WID-1:0];
+ * Q(2X).Y -> QX.Y
-
+ */
 /**** Stage 3: calculate e_t(P + IΔt) and P e_{t-1} ****/
 reg arm_stage3 = 0;
 wire epidt_finished;
 wire pe_finished;
 wire [`OUT_WID-1:0] epidt;
 mul_const #(
 	.CONSTS_WHOLE(`CONSTS_WHOLE),
 	.CONSTS_FRAC(`CONSTS_FRAC),
 	.IN_WHOLE(`E_WID),
 	.IN_FRAC(0),
 	.OUT_WHOLE(OUT_WHOLE),
 	.OUT_FRAC(OUT_FRAC)
 ) mul_const_epidt (
 	.clk(clk),
 	.inp(e_cur),
 	.const_in(idt),
 	.arm(arm_stage3),
 	.outp(epidt),
 	.finished(epidt_finished)
 );
 wire [`OUT_WID-1:0] pe;
 mul_const #(
 	.CONSTS_WHOLE(`CONSTS_WHOLE),
 	.CONSTS_FRAC(`CONSTS_FRAC),
 	.IN_WHOLE(`ERR_WID),
 	.IN_FRAC(0),
 	.OUT_WHOLE(OUT_WHOLE),
 	.OUT_FRAC(OUT_FRAC)
 ) mul_const_pe (
 	.clk(clk),
 	.inp(e_prev),
 	.const_in(idt),
 	.arm(arm_stage3),
 	.outp(pe),
 	.finished(epidt_finished)
 );
 reg [`OUT_WID+1:0] adj_val_utrunc;
 /* = prev_adj + epidt - pe; */
 intsat #(
-	.IN_LEN(`OUT_WID + 2),
+	.IN_LEN(`OUT_RTRUNC_WID),
-	.LTRUNC(2)
+	.LTRUNC(CONSTS_WHOLE)
-) adj_val_sat (
+) multiplier_saturate (
-	.inp(adj_val_utrunc),
+	.inp(out_rtrunc),
-	.outp(adj_val)
+	.outp(mul_out)
 );
-/******* State machine ********/
+/*************************
-localparam WAIT_ON_ARM = 0;
+ * Safely get rid of high bit in addition.
-localparam WAIT_ON_STAGE_1 = 1;
+ ************************/
 localparam WAIT_ON_STAGE_2 = 2;
 localparam WAIT_ON_STAGE_3 = 3;
 localparam WAIT_ON_DISARM = 4;
-localparam STATE_SIZ = 3;
+reg signed [`CONSTS_WID+1-1:0] add_sat;
-reg [STATE_SIZ-1:0] state = WAIT_ON_ARM;
+wire signed [`CONSTS_WID-1:0] saturated_add;
 intsat #(
 	.IN_LEN(`CONSTS_WID + 1),
 	.LTRUNC(1)
 ) addition_saturate (
 	.inp(add_sat),
 	.outp(saturated_add)
 );
 localparam WAIT_ON_ARM = 0;
 localparam WAIT_ON_CALCULATE_DT = 1;
 localparam CALCULATE_IDT = 2;
 localparam CALCULATE_EPIDT = 3;
 localparam CALCULATE_EP = 4;
 localparam CALCULATE_A_PART_1 = 5;
 localparam CALCULATE_A_PART_2 = 6;
 localparam WAIT_ON_DISARM = 7;
 reg [4:0] state = WAIT_ON_ARM;
 reg signed [`CONSTS_WID+1-1:0] tmpstore = 0;
 wire signed [`CONSTS_WID-1:0] tmpstore_view = tmpstore[`CONSTS_WID-1:0];
 always @ (posedge clk) begin
-	case (state) begin
+	case (state)
-	WAIT_ON_ARM: begin
+	WAIT_ON_ARM:
 		if (arm) begin
-			arm_stage_1 <= 1;
+			e_cur <= setpt - measured;
-			state <= WAIT_ON_STAGE_1;
+
 			a1 <= SEC_PER_CYCLE;
 			/* No sign extension, cycles is positive */
 			a2 <= {{(CONSTS_WHOLE - CYCLE_COUNT_WID){1'b0}}, cycles, {(CONSTS_FRAC){1'b0}}};
 			mul_arm <= 1;
 			state <= WAIT_ON_CALCULATE_DT;
 		end else begin
 			finished <= 0;
 		end
 	WAIT_ON_CALCULATE_DT:
 		if (mul_fin) begin
 			mul_arm <= 0;
 			`ifdef DEBUG_CONTROL_LOOP_MATH
 				dt_reg <= mul_out;
 			`endif
 			a1 <= mul_out; /* a1 = Δt */
 			a2 <= cl_I;
 			state <= CALCULATE_IDT;
 		end
 	CALCULATE_IDT:
 		if (!mul_arm) begin
 			mul_arm <= 1;
 		end else if (mul_fin) begin
 			mul_arm <= 0;
 			add_sat <= (mul_out + cl_P);
 			`ifdef DEBUG_CONTROL_LOOP_MATH
 				idt_reg <= mul_out;
 			`endif
 			a2 <= {{(CONSTS_WHOLE-`E_WID){e_cur[`E_WID-1]}},e_cur, {(CONSTS_FRAC){1'b0}}};
 			state <= CALCULATE_EPIDT;
 		end
 	CALCULATE_EPIDT:
 		if (!mul_arm) begin
 			a1 <= saturated_add;
 			mul_arm <= 1;
 		end else if (mul_fin) begin
 			mul_arm <= 0;
 			tmpstore <= {mul_out[`CONSTS_WID-1],mul_out};
 			`ifdef DEBUG_CONTROL_LOOP_MATH
 				epidt_reg <= mul_out;
 			`endif
 			a1 <= cl_P;
 			a2 <= {{(CONSTS_WHOLE-`E_WID){e_prev[`E_WID-1]}},e_prev, {(CONSTS_FRAC){1'b0}}};
 			state <= CALCULATE_EP;
 		end
 	CALCULATE_EP:
 		if (!mul_arm) begin
 			mul_arm <= 1;
 		end else if (mul_fin) begin
 			`ifdef DEBUG_CONTROL_LOOP_MATH
 				ep_reg <= mul_out;
 			`endif
 			mul_arm <= 0;
 			add_sat <= (tmpstore_view - mul_out);
 			state <= CALCULATE_A_PART_1;
 		end
 	CALCULATE_A_PART_1: begin
 		tmpstore <= saturated_add + adjval_prev;
 		state <= CALCULATE_A_PART_2;
 	end
-	WAIT_ON_STAGE_1: begin
+	CALCULATE_A_PART_2: begin
-		if (mul_scale_err_fin && mul_dt_fin) begin
+		add_sat <= tmpstore;
-			arm_stage_1 <= 0;
+		state <= WAIT_ON_DISARM;
 			arm_stage_2 <= 1;
 			state <= WAIT_ON_STAGE_2;
 		end
 	end
 	WAIT_ON_STAGE_2: begin
 		if (stage2_finished) begin
 			pidt_untrunc <= cl_P + idt;
 			arm_stage_2 <= 0;
 			arm_stage_3 <= 1;
 			state <= WAIT_ON_STAGE_3;
 		end
 	end
 	WAIT_ON_STAGE_3: begin
 		if (epidt_finished && pe_finished) begin
 			adj_val_utrunc <= prev_adj + epidt - pe;
 			arm_stage3 <= 0;
 			finished <= 1;
 			state <= WAIT_ON_DISARM;
 		end
 	end
 	WAIT_ON_DISARM: begin
 		adj_val <= saturated_add;
 		if (!arm) begin
 			finished <= 0;
 			state <= WAIT_ON_ARM;
 			finished <= 0;
 		end else begin
 			finished <= 1;
 		end
 	end
 	endcase
 end
 `ifdef VERILATOR
 initial begin
 	$dumpfile("control_loop_math.fst");
 	$dumpvars;
 end
 `endif
 endmodule
 `undefineall
--- a/firmware/rtl/control_loop/control_loop_math_implementation.cpp
+++ b/firmware/rtl/control_loop/control_loop_math_implementation.cpp
@ -2,7 +2,14 @@
 #define BITMASK(n) (((V)1 << (n)) - 1)
-static V sat(V r, unsigned siz) {
+/* only works on 64 bit GCC/Clang, can use boost (eww boost) */
 static V sat(__int128_t r, unsigned siz, unsigned discard) {
 	r >>= discard;
 	/* Since this is signed numbers, the actual number of bits of
 	 * the largest number is one less than the bit size. */
 	siz -= 1;
 	if (r >= BITMASK(siz)) {
 		return BITMASK(siz);
 	} else if (r <= -BITMASK(siz)) {
@ -14,15 +21,14 @@ static V sat(V r, unsigned siz) {
 	}
 }
 V mulsat(V x, V y, unsigned siz, unsigned discard) {
 	__int128_t v = (__int128_t)x * (__int128_t)y;
-V calculate_dt(V cycles, unsigned siz) {
+	return sat(v, siz, discard);
 	constexpr V sec_per_cycle = 0b10101011110011;
 	return sat(sec_per_cycle * cycles, siz);
 }
-static char d2c(int c) {
+static int d2c(unsigned d) {
-	switch (c % 10) {
+	switch (d) {
 	case 0: return '0';
 	case 1: return '1';
 	case 2: return '2';
@ -36,7 +42,6 @@ static char d2c(int c) {
 	default: return '?';
 	}
 }
 std::string fxp_to_string(const struct fixed_point &fxp) {
 	std::string r = std::to_string((fxp.val >> fxp.frac_len) & BITMASK(fxp.whole_len));
 	V frac = fxp.val & BITMASK(fxp.frac_len);
@ -51,3 +56,27 @@ std::string fxp_to_string(const struct fixed_point &fxp) {
 	return r;
 }
 #if 0
 /* Intended as an arithmetic (sign-preserving) right shift of x by len
  * bits. Currently compiled out by the enclosing #if 0.
  * NOTE(review): before re-enabling, confirm two things - CHAR_BITS is not
  * the <climits> macro name (that is CHAR_BIT), and (1 << len) is evaluated
  * in int rather than V, so the mask is wrong for large len. */
 V asr (V x, unsigned len) {
 	/* Non-negative values need no sign fill. */
 	if (x >= 0)
 		return x >> len;
 	x >>= len;
 	/* x is shifted-right by N bits. This makes a mask of
 	 * N bits, and shifts it to the highest position.
 	 */
 	V mask = ((1 << len) - 1) << (sizeof(x) * CHAR_BITS - len);
 	return mask | x;
 }
 #endif
 /* Sign-extend the low `len` bits of x to the full width of V.
  *
  * x   - value whose bit (len - 1) is treated as the sign bit.
  * len - width in bits of the field held in x.
  *
  * Returns x with every bit above the field set to 1 when the field's sign
  * bit is set, and x unchanged otherwise. A zero-width field, or one that
  * already spans all of V, is returned unchanged.
  */
 V sign_extend(V x, unsigned len) {
 	/* Nothing to extend, and keeps both shifts below within range. */
 	if (len == 0 || len >= 8 * sizeof(V)) {
 		return x;
 	}
 	/* if high bit is 1 */
 	if ((x >> (len - 1)) & 1) {
 		/* Build the mask in 64-bit unsigned arithmetic: a plain
 		 * (1 << len) is an int shift and overflows for len >= 31. */
 		uint64_t mask = ((uint64_t)1 << len) - 1;
 		return (V)(~mask | (uint64_t)x);
 	} else {
 		return x;
 	}
 }
--- a/firmware/rtl/control_loop/control_loop_math_implementation.h
+++ b/firmware/rtl/control_loop/control_loop_math_implementation.h
@ -2,16 +2,46 @@
 #include <cstdint>
 #include <string>
 #include <utility>
 #include <vector>
 #include <limits>
 #include <random>
 using V = int64_t;
 constexpr V V_min = std::numeric_limits<V>::min();
 /* Noisy linear transfer function: val(x) = m*x + b + scale * N(mean, dev).
  *
  * The noise term comes from a std::normal_distribution driven by a
  * std::default_random_engine owned by the object.
  */
 class Transfer {
 	std::default_random_engine generator;
 	std::normal_distribution<> dist;
 	double scale;
 	double m;
 	double b;
 	/* One scaled noise sample. */
 	double sample() {return scale*dist(generator);}
 	public:
 	/* scale     - multiplier applied to every noise sample.
 	 * mean, dev - parameters of the normal distribution.
 	 * m, b      - slope and offset of the linear part.
 	 * seed      - negative: seed from std::random_device;
 	 *             otherwise: use this value as the seed.
 	 *
 	 * Initializers are listed in declaration order, which is the order
 	 * members are actually constructed in.
 	 */
 	Transfer(double scale, double mean, double dev, double m, double b, int seed)
 	: generator{}, dist{mean,dev}, scale{scale}, m{m}, b{b} {
 		if (seed < 0) {
 			std::random_device rd;
 			generator.seed(rd());
 		} else {
 			generator.seed(seed);
 		}
 	}
 	/* Evaluate the transfer function at x; the result is truncated
 	 * toward zero when converted to an integer. */
 	int64_t val(double x) {
 		return static_cast<int64_t>(m*x + b + sample());
 	}
 };
 V mulsat(V x, V y, unsigned siz, unsigned discard);
 /* A fixed-point number stored in a single V, as consumed by fxp_to_string(). */
 struct fixed_point {
 	V val;              /* raw bits: whole part above the fractional part */
 	unsigned whole_len; /* number of whole (integer) bits in val */
 	unsigned frac_len;  /* number of fractional bits in val */
 };
 V calculate_dt(V cycles, unsigned siz);
 std::string fxp_to_string(const struct fixed_point &fxp);
 // V asr(V x, unsigned len);
 V sign_extend(V x, unsigned len);
--- a/firmware/rtl/control_loop/control_loop_math_sim.cpp
+++ b/firmware/rtl/control_loop/control_loop_math_sim.cpp
@ -0,0 +1,108 @@
 #include <cstdio>
 #include <cstdint>
 #include "control_loop_math_implementation.h"
 #include "Vcontrol_loop_math.h"
 using ModType = Vcontrol_loop_math;
 uint32_t main_time = 0;
 /* Current simulation time: main_time, which run_clock() increments once
  * per evaluated clock edge, widened to double. */
 double sc_time_stamp() {
 	return (double)main_time;
 }
 ModType *mod;
 /* Run one complete clock period on the model: toggle clk, evaluate and
  * count the edge, twice over. */
 static void run_clock() {
 	int edge = 0;
 	do {
 		mod->clk = !mod->clk;
 		mod->eval();
 		main_time++;
 		edge++;
 	} while (edge < 2);
 }
 /* One-time simulation setup: passes the command line to Verilated, turns
  * tracing on, creates the model in the global `mod`, and drives clk low.
  * Everything else in this file assumes it has already run. */
 static void init(int argc, char **argv) {
 	Verilated::commandArgs(argc, argv);
 	Verilated::traceEverOn(true);
 	mod = new ModType;
 	mod->clk = 0;
 }
 #define MASK(n) ((1 << (n)) - 1)
 using V = int64_t;
 constexpr V per100 = 0b010101011110011000;
 /* Runs one control-loop iteration on the simulated module.
  *
  * First computes software reference values from the inputs currently set
  * on `mod` (dt, I*dt, e*(P + I*dt), e_prev*P and the new adjustment), then
  * arms the module, clocks it until `finished` is raised, disarms it and
  * clocks two more periods. The reference values are only consumed by the
  * disabled printf block at the end.
  */
 static void calculate() {
 	/* Multiplication adds an extra CONSTS_FRAC bits to the end,
 	 * truncate them. */
 	V err_cur = (V)mod->setpt - (V)mod->measured;
 	/* Integers are shifted up by CONSTS_FRAC so both operands of each
 	 * mulsat share the same fixed-point format. */
 	V dt = mulsat(per100, (V)mod->cycles << CONSTS_FRAC, 64, CONSTS_FRAC);
 	V idt = mulsat(dt, mod->cl_I, 64, CONSTS_FRAC);
 	V epidt = mulsat(err_cur << CONSTS_FRAC, mod->cl_P + idt, 64, CONSTS_FRAC);
 	V ep = mulsat((V)mod->e_prev << CONSTS_FRAC, mod->cl_P, 64, CONSTS_FRAC);
 	V new_adjval = mod->adjval_prev + epidt - ep;
 	mod->arm = 1;
 	do {
 		run_clock();
 	} while (!mod->finished);
 	mod->arm = 0;
 	/* Two periods with arm low before the caller reads the outputs. */
 	run_clock();
 	run_clock();
 #if 0
 	/* Stupid bug: verilator does not sign-extend signed ports */
 	printf("err_cur %ld %ld\n", err_cur, sign_extend(mod->e_cur, E_WID));
 	printf("dt %ld %ld\n", dt, mod->dt_reg);
 	printf("idt %ld %ld\n", idt, mod->idt_reg);
 	printf("epidt %ld %ld\n", epidt, mod->epidt_reg);
 	printf("ep %ld %ld\n", ep, mod->ep_reg);
 	printf("adj %ld %ld\n", new_adjval, mod->adj_val);
 #endif
 }
 /* Closed-loop simulation driver.
  *
  * Sets the loop constants and initial state on the module, then runs 200
  * iterations: sample the noisy Transfer function at the current setting,
  * run one control-loop step, feed e_cur/adj_val back in as e_prev/
  * adjval_prev, and apply the whole part of the adjustment to the setting.
  * One status line is printed per iteration. Always returns 0.
  */
 int main(int argc, char **argv) {
 	init(argc, argv);
 	mod->arm = 0;
 	run_clock();
 	/* scale 150, mean 0, dev 2, slope 1.1, offset 10; a negative seed
 	 * makes Transfer seed itself from std::random_device. */
 	Transfer func = Transfer{150, 0, 2, 1.1, 10, -1};
 	/* Initial conditions */
 	mod->setpt = 10000;
 	mod->cl_P = 0b11010111000010100011110101110000101000111; /* 0.21 */
 	mod->cl_I = (V)12 << CONSTS_FRAC;
 	mod->cycles = 20; /* dummy number for now */
 	mod->e_prev = 0;
 	mod->adjval_prev = 0;
 	V setting = 100000;
 	printf("running\n");
 	for (int i = 0; i < 200; i++) {
 		mod->measured = func.val(setting);
 		calculate();
 		mod->e_prev = mod->e_cur;
 		mod->adjval_prev = mod->adj_val;
 		/* C++ has no standard arithmetic right shift */
 		V adj;
 		if ((V)mod->adj_val > 0) {
 			adj = mod->adj_val >> CONSTS_FRAC;
 		} else {
 			adj = -((-mod->adj_val) >> CONSTS_FRAC);
 		}
 		/* The port members and V are not `long`, so passing them to
 		 * %ld mismatches the format; convert every argument to
 		 * long long and print with %lld instead. */
 		printf("#%d: setting: %lld, measured: %lld, setpt: %lld, adj: %lld\n",
 			i, (long long)setting, (long long)mod->measured,
 			(long long)mod->setpt, (long long)adj);
 		setting += adj;
 	}
 	mod->final();
 	delete mod;
 	return 0;
 }
--- a/firmware/rtl/control_loop/mul_const.v
+++ b/firmware/rtl/control_loop/mul_const.v
@ -1,65 +0,0 @@
 /* mul_const: multiplies a fixed-point input by a fixed-point constant with
  * boothmul and converts the product to the OUT_WHOLE.OUT_FRAC format.
  *
  * Ports:
  *   clk, arm  - forwarded unchanged to the multiplier.
  *   inp       - IN_WHOLE.IN_FRAC operand.
  *   const_in  - CONSTS_WHOLE.CONSTS_FRAC operand.
  *   outp      - result in OUT_WHOLE.OUT_FRAC format.
  *   finished  - driven directly by the multiplier's `fin` output.
  */
 module mul_const #(
 	parameter CONSTS_WHOLE = 8,
 	parameter CONSTS_FRAC = 40,
 `define CONSTS_WID (CONSTS_WHOLE + CONSTS_FRAC)
 	parameter IN_WHOLE = CONSTS_WHOLE,
 	parameter IN_FRAC = CONSTS_FRAC,
 `define IN_WID (IN_WHOLE + IN_FRAC)
 	parameter OUT_WHOLE = 20,
 	parameter OUT_FRAC = 40
 `define OUT_WID (OUT_WHOLE + OUT_FRAC)
 ) (
 	input clk,
 	input signed [`IN_WID-1:0] inp,
 	input signed [`CONSTS_WID-1:0] const_in,
 	input arm,
 	output signed [`OUT_WID-1:0] outp,
 	output finished
 );
 /* Full-width product: widths of the two operands added together. */
 `define UNSAT_WID (`CONSTS_WID + `IN_WID)
 wire signed [`UNSAT_WID-1:0] unsat;
 boothmul #(
 	.A1_LEN(`CONSTS_WID),
 	.A2_LEN(`IN_WID)
 ) mul (
 	.clk(clk),
 	.arm(arm),
 	.a1(const_in),
 	.a2(inp),
 	.outn(unsat),
 	.fin(finished)
 );
 /* Drop the low fractional bits so that exactly OUT_FRAC remain; all whole
  * bits of the product are kept at this point. */
 `define RIGHTTRUNC_WID (CONSTS_WHOLE + IN_WHOLE + OUT_FRAC)
 `define UNSAT_FRAC (CONSTS_FRAC + IN_FRAC)
 wire signed [`RIGHTTRUNC_WID-1:0] rtrunc =
 	unsat[`UNSAT_WID-1:(`UNSAT_FRAC - OUT_FRAC)];
 /* Fit the whole part into OUT_WHOLE bits:
  *   - more whole bits than the output holds: pass through intsat with
  *     LTRUNC set to the excess (NOTE(review): intsat is defined elsewhere;
  *     presumably it saturates - confirm against intsat.v);
  *   - exactly as many: wire through;
  *   - fewer: replicate the top bit of rtrunc into the upper output bits. */
 generate if (OUT_WHOLE < CONSTS_WHOLE + IN_WHOLE) begin
 	intsat #(
 		.IN_LEN(`RIGHTTRUNC_WID),
 		.LTRUNC(CONSTS_WHOLE + IN_WHOLE - OUT_WHOLE)
 	) sat (
 		.inp(rtrunc),
 		.outp(outp)
 	);
 end else if (OUT_WHOLE == CONSTS_WHOLE + IN_WHOLE) begin
 	assign outp = rtrunc;
 end else begin
 	assign outp[`RIGHTTRUNC_WID-1:0] = rtrunc;
 	assign outp[`OUT_WID-1:`RIGHTTRUNC_WID] = {
 		(`OUT_WID-`RIGHTTRUNC_WID){rtrunc[`RIGHTTRUNC_WID-1]}
 	};
 end endgenerate
 /* Waveform dump, only when built under Verilator. */
 `ifdef VERILATOR
 initial begin
 	$dumpfile("mul_const.fst");
 	$dumpvars();
 end
 `endif
 endmodule
--- a/firmware/rtl/control_loop/mul_const_sim.cpp
+++ b/firmware/rtl/control_loop/mul_const_sim.cpp
@ -1,78 +0,0 @@
 #include <cstdio>
 #include <cstdint>
 #include "Vmul_const.h"
 using ModType = Vmul_const;
 uint32_t main_time = 0;
 /* Simulation time in clock edges, taken from the global main_time counter. */
 double sc_time_stamp() {
 	double stamp = main_time;
 	return stamp;
 }
 ModType *mod;
 /* Advance the model by one full clock period: toggle clk twice, evaluating
  * the model and incrementing main_time after each edge. */
 static void run_clock() {
 	/* The loop counter was previously used without being declared. */
 	for (int i = 0; i < 2; i++) {
 		mod->clk = !mod->clk;
 		mod->eval();
 		main_time++;
 	}
 }
 /* One-time simulation setup: forwards the command line to Verilated,
  * enables tracing, allocates the model into the global `mod`, and starts
  * with clk low. */
 static void init(int argc, char **argv) {
 	/* The runtime class is Verilated, as used on the next line. */
 	Verilated::commandArgs(argc, argv);
 	Verilated::traceEverOn(true);
 	mod = new ModType;
 	mod->clk = 0;
 }
 #define BITMASK(n) ((1 << (n)) - 1)
 /* Software reference for the multiplier: multiplies the two operands and
  * clamps the product at +/- BITMASK(48).
  * NOTE(review): this does not compile as written - it is declared void but
  * returns values, `V` and `siz` are not declared in this file, and
  * BITMASK(48) expands to an int shift by 48. The intended saturation width
  * is unclear from here; confirm before reuse. */
 static void satmul(int64_t const_in, int64_t inp) {
 	int64_t r = const_in * inp;
 	if (r >= BITMASK(48)) {
 		return BITMASK(48);
 	} else if (r <= -BITMASK(48)) {
 		V allzero = ~((V) 0);
 		// make (siz - 1) zero bits
 		return allzero & (allzero << (siz - 1));
 	} else {
 		return r; 
 	}
 }
 #define RUNS 10000
 /* Runs one multiplication through the simulated module and compares it
  * with satmul(); prints the operands and exits with status 1 on mismatch.
  * NOTE(review): this does not compile as written - IN_WID and `outp` are
  * not declared here (presumably mod->outp was meant), and
  * std::reinterpret_cast / reinterpret-cast are not valid casts. It also
  * assigns to mod->finished, which is a module output. Confirm intent
  * before reuse. */
 static void run(uint64_t const_in, uint64_t inp) {
 	/* Mask the operands down to the port widths. */
 	const_in &= BITMASK(48);
 	inp &= BITMASK(IN_WID);
 	mod->inp = inp;
 	mod->const_in = const_in;
 	mod->arm = 1;
 	/* Clock until the module reports completion. */
 	while (!mod->finished)
 		run_clock();
 	mod->finished = 0;
 	run_clock();
 	int64_t real_result = satmul(const_in, inp);
 	if (real_result != outp) {
 		printf("%llX * %llX = %llX (got %llX)\n",
 			std::reinterpret_cast<uint64_t>(const_in),
 			std::reinterpret_cast<uint64_t>(inp),
 			std::reinterpret_cast<uint64_t>(real_result),
 			std::reinterpret-cast<uint64_t>(outp));
 		exit(1);
 	}
 }
 int main(int argc, char **argv) {
 	run_clock();
 	for (int i = 0; i < RUNS; i++) {
 		run(rand() - rand(), rand() - rand());
 	}
 	return 0;
 }
--- a/firmware/rtl/control_loop/sign_extend.v
+++ b/firmware/rtl/control_loop/sign_extend.v