From 50ea679e020a9fa0c169f34fcceb6ca13092aeaf Mon Sep 17 00:00:00 2001
From: Peter McGoron <code@mcgoron.com>
Date: Sun, 13 Nov 2022 18:03:55 -0500
Subject: [PATCH] Rewrite control_loop_math and simulate

Replace the specialized math nodes with a single multiplier: each
constant must be resized to fit in the multiplier. This simplifies the
design at the cost of speed.
---
 firmware/rtl/control_loop/Makefile            |  42 +--
 firmware/rtl/control_loop/boothmul.v          |   1 +
 firmware/rtl/control_loop/calculate_dt.v      |  63 ----
 .../rtl/control_loop/calculate_dt_sim.cpp     |  62 ----
 firmware/rtl/control_loop/control_loop_math.v | 346 +++++++++---------
 .../control_loop_math_implementation.cpp      |  47 ++-
 .../control_loop_math_implementation.h        |  32 +-
 .../control_loop/control_loop_math_sim.cpp    | 108 ++++++
 firmware/rtl/control_loop/mul_const.v         |  65 ----
 firmware/rtl/control_loop/mul_const_sim.cpp   |  78 ----
 firmware/rtl/{ => control_loop}/sign_extend.v |   0
 11 files changed, 364 insertions(+), 480 deletions(-)
 delete mode 100644 firmware/rtl/control_loop/calculate_dt.v
 delete mode 100644 firmware/rtl/control_loop/calculate_dt_sim.cpp
 create mode 100644 firmware/rtl/control_loop/control_loop_math_sim.cpp
 delete mode 100644 firmware/rtl/control_loop/mul_const.v
 delete mode 100644 firmware/rtl/control_loop/mul_const_sim.cpp
 rename firmware/rtl/{ => control_loop}/sign_extend.v (100%)

diff --git a/firmware/rtl/control_loop/Makefile b/firmware/rtl/control_loop/Makefile
index 7b8f099..e0c3d60 100644
--- a/firmware/rtl/control_loop/Makefile
+++ b/firmware/rtl/control_loop/Makefile
@@ -1,32 +1,26 @@
 # Makefile for tests and hardware verification.
 
+.PHONY: test clean
 COMMON_CPP = control_loop_math_implementation.cpp
-
 COMMON= ${COMMON_CPP} control_loop_math_implementation.h
 
-obj_dir/Vmul_const.mk: mul_const_sim.cpp mul_const.v boothmul.v intsat.v
+CONSTS_FRAC=43
+E_WID=19
+
+test: obj_dir/Vcontrol_loop_math
+	obj_dir/Vcontrol_loop_math
+clean:
+	rm -rf obj_dir
+
+obj_dir/Vcontrol_loop_math.mk: control_loop_math_sim.cpp ${COMMON} \
+                               control_loop_math.v boothmul.v intsat.v
 	verilator --cc --exe -Wall --trace --trace-fst \
-		--top-module mul_const \
-		mul_const.v mul_const_sim.cpp
+		--top-module control_loop_math \
+		-GCONSTS_FRAC=${CONSTS_FRAC} -DDEBUG_CONTROL_LOOP_MATH \
+		-CFLAGS -DCONSTS_FRAC=${CONSTS_FRAC} \
+		-CFLAGS -DE_WID=${E_WID} \
+		control_loop_math.v control_loop_math_sim.cpp ${COMMON_CPP}
 
-obj_dir/Vmul_const: obj_dir/Vmul_const.mk
-	cd obj_dir && make -f Vmul_const.mk
+obj_dir/Vcontrol_loop_math: obj_dir/Vcontrol_loop_math.mk
+	cd obj_dir && make -f Vcontrol_loop_math.mk
 
-SEC_PER_CYCLE_WID=15
-CYCLE_COUNT_WID=18
-UNSAT_WID=(${SEC_PER_CYCLE_WID} + ${CYCLE_COUNT_WID})
-MAX_WID=48
-DT_WID=$(shell echo $$((${UNSAT_WID} > ${MAX_WID} ? ${MAX_WID} : ${UNSAT_WID})))
-
-obj_dir/Vcalculate_dt.mk: calculate_dt_sim.cpp calculate_dt.v ${COMMON}
-	verilator --cc --exe -Wall --trace --trace-fst \
-		--top-module calculate_dt \
-		-GSEC_PER_CYCLE_WID=${SEC_PER_CYCLE_WID} \
-		-GCYCLE_COUNT_WID=${CYCLE_COUNT_WID} \
-		-CFLAGS -DDT_WID=${DT_WID} \
-		calculate_dt.v calculate_dt_sim.cpp ${COMMON_CPP}
-obj_dir/Vcalculate_dt: obj_dir/Vcalculate_dt.mk
-	cd obj_dir && make -f Vcalculate_dt.mk
-
-test: obj_dir/Vcalculate_dt
-	obj_dir/Vcalculate_dt
diff --git a/firmware/rtl/control_loop/boothmul.v b/firmware/rtl/control_loop/boothmul.v
index b6b96d5..f235fb7 100644
--- a/firmware/rtl/control_loop/boothmul.v
+++ b/firmware/rtl/control_loop/boothmul.v
@@ -142,3 +142,4 @@ end
 `endif
 
 endmodule
+`undefineall
diff --git a/firmware/rtl/control_loop/calculate_dt.v b/firmware/rtl/control_loop/calculate_dt.v
deleted file mode 100644
index a5f1dc5..0000000
--- a/firmware/rtl/control_loop/calculate_dt.v
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Calculate and truncate Δt = cycles/100MhZ.
- * NOTE: boothmul is a SIGNED algorithm so both inputs are SIGNED.
- * This means that SEC_PER_CYCLE must have a leading 0
- * and that cycles must also have a leading zero.
- */
-
-`undefineall
-module calculate_dt #(
-	/* This number is 1/(clock cycle).
-	   The number is interpreted so the least significant bit
-	   coincides with the LSB of a constant. */
-	parameter SEC_PER_CYCLE_WID = 15,
-	parameter [SEC_PER_CYCLE_WID-1:0] SEC_PER_CYCLE = 'b010101011110011,
-	parameter CYCLE_COUNT_WID = 18,
-	parameter MAX_WID = 48
-) (
-	input clk,
-	input arm,
-	output finished,
-
-	input [CYCLE_COUNT_WID-1:0] cycles,
-
-/* Multiplication of Q18.0 and 14 lower siginifcant bits. */
-`define DT_WID_UNTRUNC (SEC_PER_CYCLE_WID + CYCLE_COUNT_WID)
-`define DT_WID (`DT_WID_UNTRUNC > MAX_WID ? MAX_WID : `DT_WID_UNTRUNC)
-
-	output [`DT_WID-1:0] dt
-);
-
-wire [`DT_WID_UNTRUNC-1:0] dt_untrunc;
-
-boothmul #(
-	.A1_LEN(CYCLE_COUNT_WID),
-	.A2_LEN(SEC_PER_CYCLE_WID)
-) mul (
-	.clk(clk),
-	.arm(arm),
-	.a1(cycles),
-	.a2(SEC_PER_CYCLE),
-	.outn(dt_untrunc),
-	.fin(finished)
-);
-
-generate if (`DT_WID_UNTRUNC > `DT_WID) begin
-	intsat #(
-		.IN_LEN(`DT_WID_UNTRUNC),
-		.LTRUNC(`DT_WID_UNTRUNC - `DT_WID)
-	) sat (
-		.inp(dt_untrunc),
-		.outp(dt)
-	);
-end else begin
-	assign dt = dt_untrunc;
-end endgenerate
-
-`ifdef VERILATOR
-initial begin
-	$dumpfile("calculate_dt.fst");
-	$dumpvars;
-end
-`endif
-
-endmodule
diff --git a/firmware/rtl/control_loop/calculate_dt_sim.cpp b/firmware/rtl/control_loop/calculate_dt_sim.cpp
deleted file mode 100644
index 2021171..0000000
--- a/firmware/rtl/control_loop/calculate_dt_sim.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-#include <cstdio>
-#include "control_loop_math_implementation.h"
-#include <verilated.h>
-#include "Vcalculate_dt.h"
-using ModType = Vcalculate_dt;
-
-uint32_t main_time = 0;
-double sc_time_stamp() {
-	return main_time;
-}
-
-ModType *mod;
-
-static void run_clock() {
-	for (int i = 0; i < 2; i++) {
-		mod->clk = !mod->clk;
-		mod->eval();
-		main_time++;
-	}
-}
-
-static void init(int argc, char **argv) {
-	Verilated::commandArgs(argc, argv);
-	Verilated::traceEverOn(true);
-	mod = new ModType;
-	mod->clk = 0;
-}
-
-int main(int argc, char **argv) {
-	int r = 0;
-
-	init(argc, argv);
-
-	for (V i = 1; i < ((1 << 17) - 1); i++) {
-		mod->cycles = i;
-		mod->arm = 1;
-		do { run_clock(); } while (!mod->finished);
-		mod->arm = 0;
-
-		V real_dt = calculate_dt(i, DT_WID);
-		if (mod->dt != real_dt) {
-			printf("(%lld) %lld != %lld\n", i, mod->dt, real_dt);
-			r = 1;
-			goto end;
-		}
-
-		struct fixed_point fxp = {
-			.val = real_dt,
-			.whole_len = 0,
-			.frac_len = 40
-		};
-
-		printf("%s\n", fxp_to_string(fxp).c_str());
-
-		run_clock();
-	}
-
-end:
-	mod->final();
-	delete mod;
-	return r;
-}
diff --git a/firmware/rtl/control_loop/control_loop_math.v b/firmware/rtl/control_loop/control_loop_math.v
index 3f3ae00..9ef5aec 100644
--- a/firmware/rtl/control_loop/control_loop_math.v
+++ b/firmware/rtl/control_loop/control_loop_math.v
@@ -1,4 +1,3 @@
-`undefineall
 /*************** Precision **************
  * The control loop is designed around these values, but generally
  * does not hardcode them.
@@ -20,225 +19,216 @@
  * Δt is cycles/100MHz. This makes Δt at least 10 ns, with a
  * maximum of 1 ms.
  *
- * [1 : sign][7: whole][40: fractional]
- * This is 127 to -128, with a resolution of 9.095e-13.
+ * [1 : sign][20: whole][43: fractional]
  */
 
 module control_loop_math #(
-	parameter CONSTS_WHOLE = 8,
-	parameter CONSTS_FRAC = 40,
+	parameter CONSTS_WHOLE = 21,
+	parameter CONSTS_FRAC = 43,
 `define CONSTS_WID (CONSTS_WHOLE + CONSTS_FRAC)
+	parameter CONSTS_SIZ=7,
 
-	parameter DAC_DATA_WID = 20,
 	parameter ADC_WID = 18,
+	parameter [`CONSTS_WID-1:0] SEC_PER_CYCLE = 'b10101011110011000,
+	parameter CYCLE_COUNT_WID = 18
 `define E_WID (ADC_WID + 1)
-	/* How large the intermediate value should be. This should hold the ADC
-	 * value /as an integer/ along with the P, I values.
-	 * The OUT_FRAC value should usually but not always be the same as CONSTS_FRAC.
-	 */
-	parameter OUT_WHOLE = 20,
-	parameter OUT_FRAC = 40
-`define OUT_WID (OUT_WHOLE + OUT_FRAC)
 ) (
 	input clk,
 	input arm,
 	output reg finished,
 
-	input [ADC_WID-1:0] setpt,
-	input [ADC_WID-1:0] measured,
-	input [`CONSTS_WID-1:0] cl_P,
-	input [`CONSTS_WID-1:0] cl_I,
-	input [CYCLE_COUNT_WID-1:0] cycles,
-	input [`ERR_WID-1:0] e_prev,
-	input [`OUT_WID-1:0] adjval_prev,
+	input signed [ADC_WID-1:0] setpt,
+	input signed [ADC_WID-1:0] measured,
+	input signed [`CONSTS_WID-1:0] cl_P,
+	input signed [`CONSTS_WID-1:0] cl_I,
+	input signed [CYCLE_COUNT_WID-1:0] cycles,
+	input signed [`E_WID-1:0] e_prev,
+	input signed [`CONSTS_WID-1:0] adjval_prev,
 
-	output reg [`ERR_WID-1:0] e_cur,
-	output [`OUT_WID-1:0] adj_val
+`ifdef DEBUG_CONTROL_LOOP_MATH
+	output reg [`CONSTS_WID-1:0] dt_reg,
+	output reg [`CONSTS_WID-1:0] idt_reg,
+	output reg [`CONSTS_WID-1:0] epidt_reg,
+	output reg [`CONSTS_WID-1:0] ep_reg,
+`endif
+	/* Both outputs are assigned in the clocked state machine, so both are reg. */
+	output reg signed [`E_WID-1:0] e_cur,
+	output reg signed [`CONSTS_WID-1:0] adj_val
 );
 
-/* Calculate current error */
-assign e_cur = setpt - measured;
-
-/**** Stage 1: calculate Δt = cycles/100MHz
- *    cycles: CYCLE_COUNT_WID.0
- *    SEC_PER_CYCLE: 0....SEC_PER_CYCLE_WID
- * -x--------------------------------
- *    dt_unsat: CYCLE_COUNT_WID + SEC_PER_CYCLE_WID
- *
- * Optimization note: the total width can be capped to below 1.
+/*******
+ * Multiplier segment.
+ * Multiplies two 64-bit numbers, then right-truncates and saturates
+ * the product to a 64-bit output, according to fixed-point rules.
+ */
 
-reg arm_stage_1 = 0;
-
-`define DT_UNSAT_WID (CYCLE_COUNT_WID + SEC_PER_CYCLE_WID)
-wire [`DT_UNSAT_WID-1:0] dt_unsat;
-wire mul_dt_fin;
+reg signed [`CONSTS_WID-1:0] a1;
+reg signed [`CONSTS_WID-1:0] a2;
+/* verilator lint_off UNUSED */
+wire signed [`CONSTS_WID+`CONSTS_WID-1:0] out_untrunc;
+wire mul_fin;
+reg mul_arm;
 
 boothmul #(
-	.A1_LEN(CYCLE_COUNT_WID),
-	.A2_LEN(SEC_PER_CYCLE_WID)
-) mul_dt (
+	.A1_LEN(`CONSTS_WID),
+	.A2_LEN(`CONSTS_WID),
+	.A2LEN_SIZ(CONSTS_SIZ)
+) multiplier (
+	.a1(a1),
+	.a2(a2),
 	.clk(clk),
-	.arm(arm_stage_1),
-	.a1(cycles),
-	.a2(SEC_PER_CYCLE),
-	.outn(dt_unsat),
-	.fin(mul_dt_fin)
+	.outn(out_untrunc),
+	.fin(mul_fin),
+	.arm(mul_arm)
 );
 
-`define DT_WID (`DT_UNSAT_WID > `CONSTS_WID ? `CONSTS_WID : `DT_UNSAT_WID)
-wire [`DT_WID-1:0] dt;
-
-`define DT_WHOLE (`DT_WID < `CONSTS_FRAC ? 0 : `CONSTS_FRAC - `DT_WID)
-`define DT_FRAC (`DT_WID - `DT_WHOLE)
-
-generate if (`DT_UNSAT_WID > `CONSTS_WID) begin
-	intsat #(
-		.IN_LEN(`DT_UNSAT_WID),
-		.LTRUNC(`DT_UNSAT_WID - `CONSTS_WID)
-	) insat_dt (
-		.inp(dt_unsat),
-		.outp(dt)
-	);
-end else begin
-	assign dt = dt_unsat;
-end endgenerate
-
-/**** Stage 2: Calculate P + IΔt
- *     I: CONSTS_WHOLE.CONSTS_FRAC
- *  x  dt: DT_WHOLE.DT_FRAC
- *-- -------------------------------
- *     Idt_unscaled:
- *-- --------------------------------
- *     Idt: CONSTS_WHOLE.CONSTS_FRAC
- *
- * Right-truncate DT_FRAC bits to ensure CONSTS_FRAC
- * Integer-sature the DT_WHOLE bits if it extends far enough
+/****************************
+ * QX.Y * QX.Y = Q(2X).(2Y)
+ * This right-truncation gets rid of the lowest Y bits.
+ * Q(2X).Y
  */
 
-wire stage2_finished;
-reg arm_stage2 = 0;
-wire [`CONSTS_WID-1:0] idt;
+`define OUT_RTRUNC_WID (`CONSTS_WID+`CONSTS_WID-CONSTS_FRAC)
+wire signed [`OUT_RTRUNC_WID-1:0] out_rtrunc
+	= out_untrunc[`CONSTS_WID+`CONSTS_WID-1:CONSTS_FRAC];
 
-mul_const #(
-	.CONSTS_WHOLE(CONSTS_WHOLE),
-	.CONSTS_FRAC(CONSTS_FRAC),
-	.IN_WHOLE(`DT_WHOLE),
-	.IN_FRAC(`DT_FRAC),
-	.OUT_WHOLE(CONSTS_WHOLE),
-	.OUT_FRAC(CONSTS_FRAC)
-) mul_const_idt (
-	.clk(clk),
-	.inp(dt),
-	.const_in(cl_I),
-	.arm(arm_stage2),
-	.outp(idt),
-	.finished(stage2_finished)
-);
+wire signed [`CONSTS_WID-1:0] mul_out;
 
-reg [`CONSTS_WID:0] pidt_untrunc;
-/* Assuming that the constraints on cl_P, I, and dt hold */
-wire [`CONSTS_WID-1:0] pidt = pidt_untrunc[`CONSTS_WID-1:0];
-
-/**** Stage 3: calculate e_t(P + IΔt) and P e_{t-1} ****/
-
-reg arm_stage3 = 0;
-
-wire epidt_finished;
-wire pe_finished;
-
-wire [`OUT_WID-1:0] epidt;
-mul_const #(
-	.CONSTS_WHOLE(`CONSTS_WHOLE),
-	.CONSTS_FRAC(`CONSTS_FRAC),
-	.IN_WHOLE(`E_WID),
-	.IN_FRAC(0),
-	.OUT_WHOLE(OUT_WHOLE),
-	.OUT_FRAC(OUT_FRAC)
-) mul_const_epidt (
-	.clk(clk),
-	.inp(e_cur),
-	.const_in(idt),
-	.arm(arm_stage3),
-	.outp(epidt),
-	.finished(epidt_finished)
-);
-
-wire [`OUT_WID-1:0] pe;
-mul_const #(
-	.CONSTS_WHOLE(`CONSTS_WHOLE),
-	.CONSTS_FRAC(`CONSTS_FRAC),
-	.IN_WHOLE(`ERR_WID),
-	.IN_FRAC(0),
-	.OUT_WHOLE(OUT_WHOLE),
-	.OUT_FRAC(OUT_FRAC)
-) mul_const_pe (
-	.clk(clk),
-	.inp(e_prev),
-	.const_in(idt),
-	.arm(arm_stage3),
-	.outp(pe),
-	.finished(epidt_finished)
-);
-
-reg [`OUT_WID+1:0] adj_val_utrunc;
-/* = prev_adj + epidt - pe; */
+/***************************
+ * Saturate higher X bits away.
+ * Q(2X).Y -> QX.Y
+ */
 
 intsat #(
-	.IN_LEN(`OUT_WID + 2),
-	.LTRUNC(2)
-) adj_val_sat (
-	.inp(adj_val_utrunc),
-	.outp(adj_val)
+	.IN_LEN(`OUT_RTRUNC_WID),
+	.LTRUNC(CONSTS_WHOLE)
+) multiplier_saturate (
+	.inp(out_rtrunc),
+	.outp(mul_out)
 );
 
-/******* State machine ********/
-localparam WAIT_ON_ARM = 0;
-localparam WAIT_ON_STAGE_1 = 1;
-localparam WAIT_ON_STAGE_2 = 2;
-localparam WAIT_ON_STAGE_3 = 3;
-localparam WAIT_ON_DISARM = 4;
+/*************************
+ * Safely get rid of high bit in addition.
+ ************************/
 
-localparam STATE_SIZ = 3;
-reg [STATE_SIZ-1:0] state = WAIT_ON_ARM;
+reg signed [`CONSTS_WID+1-1:0] add_sat;
+wire signed [`CONSTS_WID-1:0] saturated_add;
+
+intsat #(
+	.IN_LEN(`CONSTS_WID + 1),
+	.LTRUNC(1)
+) addition_saturate (
+	.inp(add_sat),
+	.outp(saturated_add)
+);
+
+localparam WAIT_ON_ARM = 0;
+localparam WAIT_ON_CALCULATE_DT = 1;
+localparam CALCULATE_IDT = 2;
+localparam CALCULATE_EPIDT = 3;
+localparam CALCULATE_EP = 4;
+localparam CALCULATE_A_PART_1 = 5;
+localparam CALCULATE_A_PART_2 = 6;
+localparam WAIT_ON_DISARM = 7;
+
+reg [4:0] state = WAIT_ON_ARM;
+reg signed [`CONSTS_WID+1-1:0] tmpstore = 0;
+wire signed [`CONSTS_WID-1:0] tmpstore_view = tmpstore[`CONSTS_WID-1:0];
 
 always @ (posedge clk) begin
-	case (state) begin
-	WAIT_ON_ARM: begin
+	case (state)
+	WAIT_ON_ARM:
 		if (arm) begin
-			arm_stage_1 <= 1;
-			state <= WAIT_ON_STAGE_1;
+			e_cur <= setpt - measured;
+
+			a1 <= SEC_PER_CYCLE;
+			/* No sign extension, cycles is positive */
+			a2 <= {{(CONSTS_WHOLE - CYCLE_COUNT_WID){1'b0}}, cycles, {(CONSTS_FRAC){1'b0}}};
+			mul_arm <= 1;
+			state <= WAIT_ON_CALCULATE_DT;
+		end else begin
+			finished <= 0;
 		end
+	WAIT_ON_CALCULATE_DT:
+		if (mul_fin) begin
+			mul_arm <= 0;
+
+			`ifdef DEBUG_CONTROL_LOOP_MATH
+				dt_reg <= mul_out;
+			`endif
+
+			a1 <= mul_out; /* a1 = Δt */
+			a2 <= cl_I;
+			state <= CALCULATE_IDT;
+		end
+	CALCULATE_IDT:
+		if (!mul_arm) begin
+			mul_arm <= 1;
+		end else if (mul_fin) begin
+			mul_arm <= 0;
+			add_sat <= (mul_out + cl_P);
+
+			`ifdef DEBUG_CONTROL_LOOP_MATH
+				idt_reg <= mul_out;
+			`endif
+
+			a2 <= {{(CONSTS_WHOLE-`E_WID){e_cur[`E_WID-1]}},e_cur, {(CONSTS_FRAC){1'b0}}};
+			state <= CALCULATE_EPIDT;
+		end
+	CALCULATE_EPIDT:
+		if (!mul_arm) begin
+			a1 <= saturated_add;
+			mul_arm <= 1;
+		end else if (mul_fin) begin
+			mul_arm <= 0;
+			tmpstore <= {mul_out[`CONSTS_WID-1],mul_out};
+
+			`ifdef DEBUG_CONTROL_LOOP_MATH
+				epidt_reg <= mul_out;
+			`endif
+
+			a1 <= cl_P;
+			a2 <= {{(CONSTS_WHOLE-`E_WID){e_prev[`E_WID-1]}},e_prev, {(CONSTS_FRAC){1'b0}}};
+			state <= CALCULATE_EP;
+		end
+	CALCULATE_EP:
+		if (!mul_arm) begin
+			mul_arm <= 1;
+		end else if (mul_fin) begin
+			`ifdef DEBUG_CONTROL_LOOP_MATH
+				ep_reg <= mul_out;
+			`endif
+
+			mul_arm <= 0;
+			add_sat <= (tmpstore_view - mul_out);
+			state <= CALCULATE_A_PART_1;
+		end
+	CALCULATE_A_PART_1: begin
+		tmpstore <= saturated_add + adjval_prev;
+		state <= CALCULATE_A_PART_2;
 	end
-	WAIT_ON_STAGE_1: begin
-		if (mul_scale_err_fin && mul_dt_fin) begin
-			arm_stage_1 <= 0;
-			arm_stage_2 <= 1;
-			state <= WAIT_ON_STAGE_2;
-		end
-	end
-	WAIT_ON_STAGE_2: begin
-		if (stage2_finished) begin
-			pidt_untrunc <= cl_P + idt;
-			arm_stage_2 <= 0;
-			arm_stage_3 <= 1;
-			state <= WAIT_ON_STAGE_3;
-		end
-	end
-	WAIT_ON_STAGE_3: begin
-		if (epidt_finished && pe_finished) begin
-			adj_val_utrunc <= prev_adj + epidt - pe;
-			arm_stage3 <= 0;
-			finished <= 1;
-			state <= WAIT_ON_DISARM;
-		end
+	CALCULATE_A_PART_2: begin
+		add_sat <= tmpstore;
+		state <= WAIT_ON_DISARM;
 	end
 	WAIT_ON_DISARM: begin
+		adj_val <= saturated_add;
 		if (!arm) begin
-			finished <= 0;
 			state <= WAIT_ON_ARM;
+			finished <= 0;
+		end else begin
+			finished <= 1;
 		end
 	end
+	endcase
 end
 
+`ifdef VERILATOR
+initial begin
+	$dumpfile("control_loop_math.fst");
+	$dumpvars;
+end
+`endif
+
 endmodule
+`undefineall
diff --git a/firmware/rtl/control_loop/control_loop_math_implementation.cpp b/firmware/rtl/control_loop/control_loop_math_implementation.cpp
index 089190c..3459da7 100644
--- a/firmware/rtl/control_loop/control_loop_math_implementation.cpp
+++ b/firmware/rtl/control_loop/control_loop_math_implementation.cpp
@@ -2,7 +2,14 @@
 
 #define BITMASK(n) (((V)1 << (n)) - 1)
 
-static V sat(V r, unsigned siz) {
+/* only works on 64 bit GCC/Clang, can use boost (eww boost) */
+
+static V sat(__int128_t r, unsigned siz, unsigned discard) {
+	r >>= discard;
+	/* These are signed numbers, so the largest magnitude uses one
+	 * bit fewer than the total bit size. */
+	siz -= 1;
+
 	if (r >= BITMASK(siz)) {
 		return BITMASK(siz);
 	} else if (r <= -BITMASK(siz)) {
@@ -10,19 +17,18 @@ static V sat(V r, unsigned siz) {
 		// make (siz - 1) zero bits
 		return allzero & (allzero << (siz - 1));
 	} else {
-		return r; 
+		return r;
 	}
 }
 
+V mulsat(V x, V y, unsigned siz, unsigned discard) {
+	__int128_t v = (__int128_t)x * (__int128_t)y;
 
-V calculate_dt(V cycles, unsigned siz) {
-	constexpr V sec_per_cycle = 0b10101011110011;
-
-	return sat(sec_per_cycle * cycles, siz);
+	return sat(v, siz, discard);
 }
 
-static char d2c(int c) {
-	switch (c % 10) {
+static int d2c(unsigned d) {
+	switch (d) {
 	case 0: return '0';
 	case 1: return '1';
 	case 2: return '2';
@@ -36,7 +42,6 @@ static char d2c(int c) {
 	default: return '?';
 	}
 }
-
 std::string fxp_to_string(const struct fixed_point &fxp) {
 	std::string r = std::to_string((fxp.val >> fxp.frac_len) & BITMASK(fxp.whole_len));
 	V frac = fxp.val & BITMASK(fxp.frac_len);
@@ -51,3 +56,27 @@ std::string fxp_to_string(const struct fixed_point &fxp) {
 
 	return r;
 }
+
+#if 0
+V asr (V x, unsigned len) {
+	if (x >= 0)
+		return x >> len;
+	x >>= len;
+
+	/* x is shifted-right by N bits. This makes a mask of
+	 * N bits, and shifts it to the highest position.
+	 */
+	V mask = ((1 << len) - 1) << (sizeof(x) * CHAR_BITS - len);
+	return mask | x;
+}
+#endif
+
+V sign_extend(V x, unsigned len) {
+	/* Sign-extend the low `len` bits of x to the full width of V.
+	 * The mask is built from a 64-bit 1: shifting a plain int by
+	 * len >= 31 overflows. Widths outside 1..63 return x unchanged. */
+	if (len >= 1 && len <= 63 && ((x >> (len - 1)) & 1)) {
+		return x | (V)~(((uint64_t)1 << len) - 1);
+	}
+	return x;
+}
diff --git a/firmware/rtl/control_loop/control_loop_math_implementation.h b/firmware/rtl/control_loop/control_loop_math_implementation.h
index 158f4d8..8a8eda9 100644
--- a/firmware/rtl/control_loop/control_loop_math_implementation.h
+++ b/firmware/rtl/control_loop/control_loop_math_implementation.h
@@ -2,16 +2,46 @@
 #include <cstdint>
 #include <string>
 #include <utility>
+#include <vector>
 #include <limits>
+#include <random>
 
 using V = int64_t;
 constexpr V V_min = std::numeric_limits<V>::min();
 
+class Transfer {
+	std::default_random_engine generator; /* noise source, seeded in the constructor */
+	std::normal_distribution<> dist;      /* unscaled noise distribution (mean, dev) */
+	double scale;                         /* multiplier applied to each noise sample */
+	double m;                             /* slope of the linear response */
+	double b;                             /* offset of the linear response */
+
+	double sample() {return scale*dist(generator);}
+
+	public:
+	Transfer(double scale, double mean, double dev, double m, double b, int seed)
+	: generator{}, dist{mean,dev}, scale{scale}, m{m}, b{b} { /* declaration order */
+		if (seed < 0) {
+			std::random_device rd;
+			generator.seed(rd());
+		} else {
+			generator.seed(seed);
+		}
+	}
+
+	int64_t val(double x) {
+		return static_cast<int64_t>(m*x + b + sample());
+	}
+};
+
+V mulsat(V x, V y, unsigned siz, unsigned discard);
+
 struct fixed_point {
 	V val;
 	unsigned whole_len;
 	unsigned frac_len;
 };
 
-V calculate_dt(V cycles, unsigned siz);
 std::string fxp_to_string(const struct fixed_point &fxp);
+// V asr(V x, unsigned len);
+V sign_extend(V x, unsigned len);
diff --git a/firmware/rtl/control_loop/control_loop_math_sim.cpp b/firmware/rtl/control_loop/control_loop_math_sim.cpp
new file mode 100644
index 0000000..559683f
--- /dev/null
+++ b/firmware/rtl/control_loop/control_loop_math_sim.cpp
@@ -0,0 +1,108 @@
+#include <cstdio>
+#include <cstdint>
+#include "control_loop_math_implementation.h"
+#include "Vcontrol_loop_math.h"
+using ModType = Vcontrol_loop_math;
+
+uint32_t main_time = 0;
+double sc_time_stamp() {
+	return main_time;
+}
+
+ModType *mod;
+
+static void run_clock() {
+	for (int i = 0; i < 2; i++) {
+		mod->clk = !mod->clk;
+		mod->eval();
+		main_time++;
+	}
+}
+
+static void init(int argc, char **argv) {
+	Verilated::commandArgs(argc, argv);
+	Verilated::traceEverOn(true);
+	mod = new ModType;
+	mod->clk = 0;
+}
+
+#define MASK(n) ((1 << (n)) - 1)
+using V = int64_t;
+
+constexpr V per100 = 0b010101011110011000;
+
+static void calculate() {
+	/* Multiplication adds an extra CONSTS_FRAC bits to the end,
+	 * truncate them. */
+
+	V err_cur = (V)mod->setpt - (V)mod->measured;
+	V dt = mulsat(per100, (V)mod->cycles << CONSTS_FRAC, 64, CONSTS_FRAC);
+	V idt = mulsat(dt, mod->cl_I, 64, CONSTS_FRAC);
+	V epidt = mulsat(err_cur << CONSTS_FRAC, mod->cl_P + idt, 64, CONSTS_FRAC);
+	V ep = mulsat((V)mod->e_prev << CONSTS_FRAC, mod->cl_P, 64, CONSTS_FRAC);
+	V new_adjval = mod->adjval_prev + epidt - ep;
+
+	mod->arm = 1;
+
+	do {
+		run_clock();
+	} while (!mod->finished);
+
+	mod->arm = 0;
+	run_clock();
+	run_clock();
+
+#if 0
+	/* Stupid bug: verilator does not sign-extend signed ports */
+
+	printf("err_cur %ld %ld\n", err_cur, sign_extend(mod->e_cur, E_WID));
+	printf("dt %ld %ld\n", dt, mod->dt_reg);
+	printf("idt %ld %ld\n", idt, mod->idt_reg);
+	printf("epidt %ld %ld\n", epidt, mod->epidt_reg);
+	printf("ep %ld %ld\n", ep, mod->ep_reg);
+	printf("adj %ld %ld\n", new_adjval, mod->adj_val);
+#endif
+}
+
+int main(int argc, char **argv) {
+	init(argc, argv);
+	mod->arm = 0;
+	run_clock();
+	Transfer func = Transfer{150, 0, 2, 1.1, 10, -1};
+
+	/* Initial conditions */
+	mod->setpt = 10000;
+	mod->cl_P = 0b11010111000010100011110101110000101000111; /* 0.21 */
+	mod->cl_I = (V)12 << CONSTS_FRAC;
+	mod->cycles = 20; /* dummy number for now */
+	mod->e_prev = 0;
+	mod->adjval_prev = 0;
+
+	V setting = 100000;
+	/* Run 200 iterations, adding each integer adjustment back into setting. */
+	printf("running\n");
+	for (int i = 0; i < 200; i++) {
+		mod->measured = func.val(setting);
+
+		calculate();
+		mod->e_prev = mod->e_cur;
+		mod->adjval_prev = mod->adj_val;
+		/* Drop the fractional bits, rounding toward zero. C++ has no
+		 * standard arithmetic right shift, so negate, shift, negate back. */
+		V adj;
+
+		if ((V)mod->adj_val > 0) {
+			adj = mod->adj_val >> CONSTS_FRAC;
+		} else {
+			adj = -((-mod->adj_val) >> CONSTS_FRAC);
+		}
+		/* Cast explicitly: the Verilator port types need not match the format. */
+		printf("#%d: setting: %lld, measured: %lld, setpt: %lld, adj: %lld\n", i, (long long)setting, (long long)mod->measured, (long long)mod->setpt, (long long)adj);
+
+		setting += adj;
+	}
+
+	mod->final();
+	delete mod;
+	return 0;
+}
diff --git a/firmware/rtl/control_loop/mul_const.v b/firmware/rtl/control_loop/mul_const.v
deleted file mode 100644
index cefb0af..0000000
--- a/firmware/rtl/control_loop/mul_const.v
+++ /dev/null
@@ -1,65 +0,0 @@
-module mul_const #(
-	parameter CONSTS_WHOLE = 8,
-	parameter CONSTS_FRAC = 40,
-`define CONSTS_WID (CONSTS_WHOLE + CONSTS_FRAC)
-	parameter IN_WHOLE = CONSTS_WHOLE,
-	parameter IN_FRAC = CONSTS_FRAC,
-`define IN_WID (IN_WHOLE + IN_FRAC)
-	parameter OUT_WHOLE = 20,
-	parameter OUT_FRAC = 40
-`define OUT_WID (OUT_WHOLE + OUT_FRAC)
-) (
-	input clk,
-	input signed [`IN_WID-1:0] inp,
-	input signed [`CONSTS_WID-1:0] const_in,
-	input arm,
-
-	output signed [`OUT_WID-1:0] outp,
-	output finished
-);
-
-`define UNSAT_WID (`CONSTS_WID + `IN_WID)
-wire signed [`UNSAT_WID-1:0] unsat;
-
-boothmul #(
-	.A1_LEN(`CONSTS_WID),
-	.A2_LEN(`IN_WID)
-) mul (
-	.clk(clk),
-	.arm(arm),
-	.a1(const_in),
-	.a2(inp),
-	.outn(unsat),
-	.fin(finished)
-);
-
-`define RIGHTTRUNC_WID (CONSTS_WHOLE + IN_WHOLE + OUT_FRAC)
-`define UNSAT_FRAC (CONSTS_FRAC + IN_FRAC)
-wire signed [`RIGHTTRUNC_WID-1:0] rtrunc =
-	unsat[`UNSAT_WID-1:(`UNSAT_FRAC - OUT_FRAC)];
-
-generate if (OUT_WHOLE < CONSTS_WHOLE + IN_WHOLE) begin
-	intsat #(
-		.IN_LEN(`RIGHTTRUNC_WID),
-		.LTRUNC(CONSTS_WHOLE + IN_WHOLE - OUT_WHOLE)
-	) sat (
-		.inp(rtrunc),
-		.outp(outp)
-	);
-end else if (OUT_WHOLE == CONSTS_WHOLE + IN_WHOLE) begin
-	assign outp = rtrunc;
-end else begin
-	assign outp[`RIGHTTRUNC_WID-1:0] = rtrunc;
-	assign outp[`OUT_WID-1:`RIGHTTRUNC_WID] = {
-		(`OUT_WID-`RIGHTTRUNC_WID){rtrunc[`RIGHTTRUNC_WID-1]}
-	};
-end endgenerate
-
-`ifdef VERILATOR
-initial begin
-	$dumpfile("mul_const.fst");
-	$dumpvars();
-end
-`endif
-
-endmodule
diff --git a/firmware/rtl/control_loop/mul_const_sim.cpp b/firmware/rtl/control_loop/mul_const_sim.cpp
deleted file mode 100644
index 3661b62..0000000
--- a/firmware/rtl/control_loop/mul_const_sim.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-#include <cstdio>
-#include <cstdint>
-#include "Vmul_const.h"
-using ModType = Vmul_const;
-
-uint32_t main_time = 0;
-double sc_time_stamp() {
-	return main_time;
-}
-
-ModType *mod;
-
-static void run_clock() {
-	for (i = 0; i < 2; i++) {
-		mod->clk = !mod->clk;
-		mod->eval();
-		main_time++;
-	}
-}
-
-static void init(int argc, char **argv) {
-	Verilator::commandArgs(argc, argv);
-	Verilated::traceEverOn(true);
-	mod = new ModType;
-	mod->clk = 0;
-}
-
-#define BITMASK(n) ((1 << (n)) - 1)
-
-static void satmul(int64_t const_in, int64_t inp) {
-	int64_t r = const_in * inp;
-	if (r >= BITMASK(48)) {
-		return BITMASK(48);
-	} else if (r <= -BITMASK(48)) {
-		V allzero = ~((V) 0);
-		// make (siz - 1) zero bits
-		return allzero & (allzero << (siz - 1));
-	} else {
-		return r; 
-	}
-}
-
-#define RUNS 10000
-static void run(uint64_t const_in, uint64_t inp) {
-	const_in &= BITMASK(48);
-	inp &= BITMASK(IN_WID);
-
-	mod->inp = inp;
-	mod->const_in = const_in;
-	mod->arm = 1;
-
-	while (!mod->finished)
-		run_clock();
-	mod->finished = 0;
-	run_clock();
-
-
-	int64_t real_result = satmul(const_in, inp);
-
-	if (real_result != outp) {
-		printf("%llX * %llX = %llX (got %llX)\n",
-			std::reinterpret_cast<uint64_t>(const_in),
-			std::reinterpret_cast<uint64_t>(inp),
-			std::reinterpret_cast<uint64_t>(real_result),
-			std::reinterpret-cast<uint64_t>(outp));
-		exit(1);
-	}
-}
-
-int main(int argc, char **argv) {
-	run_clock();
-
-	for (int i = 0; i < RUNS; i++) {
-		run(rand() - rand(), rand() - rand());
-	}
-
-	return 0;
-}
diff --git a/firmware/rtl/sign_extend.v b/firmware/rtl/control_loop/sign_extend.v
similarity index 100%
rename from firmware/rtl/sign_extend.v
rename to firmware/rtl/control_loop/sign_extend.v