genlib/cordic: cleanup, documentation, unittests

2025-01-04 09:52:26 -05:00 · 2013-11-30 06:55:01 -07:00 · 2013-11-30 06:55:01 -07:00 · 9762546c95
commit 9762546c95
parent e54fa6f5f4
4 changed files with 584 additions and 192 deletions
--- a/doc/api.rst
+++ b/doc/api.rst
@ -21,3 +21,10 @@ migen API Documentation
 .. automodule:: migen.genlib.coding
        :members:
        :show-inheritance:
+
+:mod:`genlib.cordic` Module
+------------------------------
+
+.. automodule:: migen.genlib.cordic
+        :members:
+        :show-inheritance:
--- a/examples/sim/cordic_err.py
+++ b/examples/sim/cordic_err.py
@ -0,0 +1,115 @@
+import random
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+from migen.fhdl.std import *
+from migen.fhdl import verilog
+from migen.genlib.cordic import Cordic
+from migen.sim.generic import Simulator
+
+
+class TestBench(Module):
+	def __init__(self, n=None, xmax=.98, i=None, **kwargs):
+		self.submodules.cordic = Cordic(**kwargs)
+		if n is None:
+			n = 1<<flen(self.cordic.xi)
+		self.c = c = 2**(flen(self.cordic.xi) - 1)
+		self.cz = cz = 2**(flen(self.cordic.zi) - 1)
+		if i is None:
+			i = [(int(xmax*c/self.cordic.gain), 0, int(cz*(i/n - .5)))
+					for i in range(n)]
+		self.i = i
+		random.shuffle(self.i)
+		self.ii = iter(self.i)
+		self.o = []
+
+	def do_simulation(self, s):
+		if s.rd(self.cordic.new_in):
+			try:
+				xi, yi, zi = next(self.ii)
+			except StopIteration:
+				s.interrupt = True
+				return
+			s.wr(self.cordic.xi, xi)
+			s.wr(self.cordic.yi, yi)
+			s.wr(self.cordic.zi, zi)
+		if s.rd(self.cordic.new_out):
+			xo = s.rd(self.cordic.xo)
+			yo = s.rd(self.cordic.yo)
+			zo = s.rd(self.cordic.zo)
+			self.o.append((xo, yo, zo))
+
+	def run_io(self): 
+		with Simulator(self) as sim:
+			sim.run()
+		del self.i[-1], self.o[0]
+		if self.i[0] != (0, 0, 0):
+			assert self.o[0] != (0, 0, 0)
+		if self.i[-1] != self.i[-2]:
+			assert self.o[-1] != self.o[-2], self.o[-2:]
+
+def rms_err(width, guard=None, stages=None, n=None):
+	tb = TestBench(width=width, guard=guard, stages=stages,
+			n=n, eval_mode="combinatorial")
+	tb.run_io()
+	c = 2**(flen(tb.cordic.xi) - 1)
+	cz = 2**(flen(tb.cordic.zi) - 1)
+	g = tb.cordic.gain
+	xi, yi, zi = np.array(tb.i).T/c
+	zi *= c/cz*tb.cordic.zmax
+	xo1, yo1, zo1 = np.array(tb.o).T
+	xo = np.floor(c*g*(np.cos(zi)*xi - np.sin(zi)*yi))
+	yo = np.floor(c*g*(np.sin(zi)*xi + np.cos(zi)*yi))
+	dx = xo1 - xo
+	dy = yo1 - yo
+	mm = np.fabs([dx, dy]).max()
+	rms = np.sqrt(dx**2 + dy**2).sum()/len(xo)
+	return rms, mm
+
+def rms_err_map():
+	widths, stages = np.mgrid[8:33:1, 8:37:1]
+	errf = np.vectorize(lambda w, s: _rms_err(int(w), None, int(s), n=333))
+	err = errf(widths, stages)
+	print(err)
+	lev = np.arange(10)
+	fig, ax = plt.subplots()
+	c1 = ax.contour(widths, stages, err[0], lev/10, cmap=plt.cm.Greys_r)
+	c2 = ax.contour(widths, stages, err[1], lev, cmap=plt.cm.Reds_r)
+	ax.plot(widths[:, 0], stages[0, np.argmin(err[0], 1)], "ko")
+	ax.plot(widths[:, 0], stages[0, np.argmin(err[1], 1)], "ro")
+	print(widths[:, 0], stages[0, np.argmin(err[0], 1)],
+			stages[0, np.argmin(err[1], 1)])
+	ax.set_xlabel("width")
+	ax.set_ylabel("stages")
+	ax.grid("on")
+	fig.colorbar(c1)
+	fig.colorbar(c2)
+	fig.savefig("cordic_rms.pdf")
+
+def plot_function(**kwargs):
+	tb = TestBench(eval_mode="combinatorial", **kwargs)
+	tb.run_io()
+	c = 2**(flen(tb.cordic.xi) - 1)
+	cz = 2**(flen(tb.cordic.zi) - 1)
+	g = tb.cordic.gain
+	xi, yi, zi = np.array(tb.i).T
+	xo, yo, zo = np.array(tb.o).T
+	fig, ax = plt.subplots()
+	ax.plot(zi, xo, "r,")
+	ax.plot(zi, yo, "g,")
+	ax.plot(zi, zo, "g,")
+
+
+if __name__ == "__main__":
+	c = Cordic(width=16, guard=None, eval_mode="combinatorial")
+	print(verilog.convert(c, ios={c.xi, c.yi, c.zi, c.xo, c.yo, c.zo,
+		c.new_in, c.new_out}))
+	#print(rms_err(8))
+	#rms_err_map()
+	#plot_function(func_mode="hyperbolic", xmax=.3, width=16, n=333)
+	#plot_function(func_mode="circular", width=16, n=333)
+	#plot_function(func_mode="hyperbolic", cordic_mode="vector",
+    #        xmax=.3, width=16, n=333)
+	#plot_function(func_mode="circular", width=16, n=333)
+	plt.show()
--- a/migen/genlib/cordic.py
+++ b/migen/genlib/cordic.py
@ -1,71 +1,202 @@
-from math import atan, atanh, log, sqrt, pi
+from math import atan, atanh, log, sqrt, pi, ceil

 from migen.fhdl.std import *

 class TwoQuadrantCordic(Module):
-	"""
+	"""Coordinate rotation digital computer
+
+	Trigonometric, and arithmetic functions implemented using
+	additions/subtractions and shifts.
+
 	http://eprints.soton.ac.uk/267873/1/tcas1_cordic_review.pdf
+
+	http://www.andraka.com/files/crdcsrvy.pdf
+
+	http://zatto.free.fr/manual/Volder_CORDIC.pdf
+
+	The way the CORDIC is executed is controlled by `eval_mode`.
+	If `"iterative"` the stages are iteratively evaluated, one per clock
+	cycle. This mode uses the least amount of registers, but has the
+	lowest throughput and highest latency.  If `"pipelined"` all stages
+	are executed in every clock cycle but separated by registers.  This
+	mode has full throughput but uses many registers and has large
+	latency. If `"combinatorial"`, there are no registers, throughput is
+	maximal and latency is zero. `"pipelined"` and `"combinatorial"` use
+	the same number of sphifters and adders.
+
+	The type of trigonometric/arithmetic function is determined by
+	`cordic_mode` and `func_mode`. :math:`g` is the gain of the CORDIC.
+
+		* rotate-circular: rotate the vector `(xi, yi)` by an angle `zi`.
+		  Used to calculate trigonometric functions, `sin(), cos(),
+		  tan() = sin()/cos()`, or to perform polar-to-cartesian coordinate
+		  transformation:
+
+			.. math::
+				x_o = g \\cos(z_i) x_i - g \\sin(z_i) y_i
+
+				y_o = g \\sin(z_i) x_i + g \\cos(z_i) y_i
+
+		* vector-circular: determine length and angle of the vector
+		  `(xi, yi)`.  Used to calculate `arctan(), sqrt()` or
+		  to perform cartesian-to-polar transformation:
+
+			.. math::
+				x_o = g\\sqrt{x_i^2 + y_i^2}
+
+				z_o = z_i + \\tan^{-1}(y_i/x_i)
+
+		* rotate-hyperbolic: hyperbolic functions of `zi`. Used to
+		  calculate hyprbolic functions, `sinh, cosh, tanh = cosh/sinh,
+		  exp = cosh + sinh`:
+
+			.. math::
+				x_o = g \\cosh(z_i) x_i + g \\sinh(z_i) y_i
+
+				y_o = g \\sinh(z_i) x_i + g \\cosh(z_i) z_i
+
+		* vector-hyperbolic: natural logarithm `ln(), arctanh()`, and
+		  `sqrt()`. Use `x_i = a + b` and `y_i = a - b` to obtain `2*
+		  sqrt(a*b)` and `ln(a/b)/2`:
+
+			.. math::
+				x_o = g\\sqrt{x_i^2 - y_i^2}
+
+				z_o = z_i + \\tanh^{-1}(y_i/x_i)
+
+		* rotate-linear: multiply and accumulate (not a very good
+		  multiplier implementation):
+
+			.. math::
+				y_o = g(y_i + x_i z_i)
+
+		* vector-linear: divide and accumulate:
+
+			.. math::
+				z_o = g(z_i + y_i/x_i)
+
+	Parameters
+	----------
+	width : int
+		Bit width of the input and output signals. Defaults to 16. Input
+		and output signals are signed.
+	widthz : int
+		Bit with of `zi` and `zo`. Defaults to the `width`.
+	stages : int or None
+		Number of CORDIC incremental rotation stages. Defaults to
+		`width + min(1, guard)`.
+	guard : int or None
+		Add guard bits to the intermediate signals. If `None`,
+		defaults to `guard = log2(width)` which guarantees accuracy
+		to `width` bits.
+	eval_mode : str, {"iterative", "pipelined", "combinatorial"}
+	cordic_mode : str, {"rotate", "vector"}
+	func_mode : str, {"circular", "linear", "hyperbolic"}
+		Evaluation and arithmetic mode. See above.
+
+	Attributes
+	----------
+	xi, yi, zi : Signal(width), in
+		Input values, signed.
+	xo, yo, zo : Signal(width), out
+		Output values, signed.
+	new_out : Signal(1), out
+		Asserted if output values are freshly updated in the current
+		cycle.
+	new_in : Signal(1), out
+		Asserted if new input values are being read in the next cycle.
+	zmax : float
+		`zi` and `zo` normalization factor. Floating point `zmax`
+		corresponds to `1<<(widthz - 1)`. `x` and `y` are scaled such
+		that floating point `1` corresponds to `1<<(width - 1)`.
+	gain : float
+		Cumulative, intrinsic gain and scaling factor. In circular mode
+		`sqrt(xi**2 + yi**2)` should be no larger than `2**(width - 1)/gain`
+		to prevent overflow. Additionally, in hyperbolic and linear mode,
+		the operation itself can cause overflow.
+	interval : int
+		Output interval in clock cycles. Inverse throughput.
+	latency : int
+		Input-to-output latency. The result corresponding to the inputs
+		appears at the outputs `latency` cycles later.
+
+	Notes
+	-----
+
+	Each stage `i` in the CORDIC performs the following operation:
+
+	.. math::
+		x_{i+1} = x_i - m d_i y_i r^{-s_{m,i}},
+
+		y_{i+1} = y_i + d_i x_i r^{-s_{m,i}},
+
+		z_{i+1} = z_i - d_i a_{m,i},
+
+	where:
+
+		* :math:`d_i`: clockwise or counterclockwise, determined by
+		  `sign(z_i)` in rotate mode or `sign(-y_i)` in vector mode.
+
+		* :math:`r`: radix of the number system (2)
+
+		* :math:`m`: 1: circular, 0: linear, -1: hyperbolic
+
+		* :math:`s_{m,i}`: non decreasing integer shift sequence
+
+		* :math:`a_{m,i}`: elemetary rotation angle: :math:`a_{m,i} =
+		  \\tan^{-1}(\\sqrt{m} s_{m,i})/\\sqrt{m}`.
 	"""
-	def __init__(self, width=16, stages=None, guard=0,
+	def __init__(self, width=16, widthz=None, stages=None, guard=0,
 			eval_mode="iterative", cordic_mode="rotate",
 			func_mode="circular"):
 		# validate parameters
 		assert eval_mode in ("combinatorial", "pipelined", "iterative")
 		assert cordic_mode in ("rotate", "vector")
-		self.cordic_mode = cordic_mode
 		assert func_mode in ("circular", "linear", "hyperbolic")
+		self.cordic_mode = cordic_mode
 		self.func_mode = func_mode
 		if guard is None:
 			# guard bits to guarantee "width" accuracy
 			guard = int(log(width)/log(2))
+		if widthz is None:
+			widthz = width
 		if stages is None:
-			stages = width + guard
+			stages = width + min(1, guard) # cuts error below LSB

-		# calculate the constants
-		if func_mode == "circular":
-			s = range(stages)
-			a = [atan(2**-i) for i in s]
-			g = [sqrt(1 + 2**(-2*i)) for i in s]
-			zmax = pi/2
-		elif func_mode == "linear":
-			s = range(stages)
-			a = [2**-i for i in s]
-			g = [1 for i in s]
-			zmax = 2.
-		elif func_mode == "hyperbolic":
-			s = list(range(1, stages+1))
-			# need to repeat these stages:
-			j = 4
-			while j < stages+1:
-				s.append(j)
-				j = 3*j + 1
-			s.sort()
-			stages = len(s)
-			a = [atanh(2**-i) for i in s]
-			g = [sqrt(1 - 2**(-2*i)) for i in s]
-			zmax = 1.
+		# input output interface
+		self.xi = Signal((width, True))
+		self.yi = Signal((width, True))
+		self.zi = Signal((widthz, True))
+		self.xo = Signal((width, True))
+		self.yo = Signal((width, True))
+		self.zo = Signal((widthz, True))
+		self.new_in = Signal()
+		self.new_out = Signal()

-		a = [Signal((width+guard, True), "{}{}".format("a", i),
-			reset=int(round(ai*2**(width + guard - 1)/zmax)))
-			for i, ai in enumerate(a)]
-		self.zmax = zmax #/2**(width - 1)
-		self.gain = 1.
-		for gi in g:
-			self.gain *= gi
+		###

-		exec_target, num_reg, self.latency, self.interval = {
-			"combinatorial": (self.comb, stages + 1, 0,          1),
-			"pipelined":     (self.sync, stages + 1, stages,     1),
-			"iterative":     (self.sync, 3,          stages + 1, stages + 1),
-			}[eval_mode]
+		a, s, self.zmax, self.gain = self._constants(stages, widthz + guard)
+		stages = len(a) # may have increased due to repetitions

-		# i/o and inter-stage signals
-		self.fresh = Signal()
-		self.xi, self.yi, self.zi, self.xo, self.yo, self.zo = (
-				Signal((width, True), l + io) for io in "io" for l in "xyz")
-		x, y, z = ([Signal((width + guard, True), "{}{}".format(l, i))
-			for i in range(num_reg)] for l in "xyz")
+		if eval_mode == "iterative":
+			num_sig = 3
+			self.interval = stages + 1
+			self.latency = stages + 2
+		else:
+			num_sig = stages + 1
+			self.interval = 1
+			if eval_mode == "pipelined":
+				self.latency = stages
+			else: # combinatorial
+				self.latency = 0

+		# inter-stage signals
+		x = [Signal((width + guard, True)) for i in range(num_sig)]
+		y = [Signal((width + guard, True)) for i in range(num_sig)]
+		z = [Signal((widthz + guard, True)) for i in range(num_sig)]
+
+		# hook up inputs and outputs to the first and last inter-stage
+		# signals
 		self.comb += [
 			x[0].eq(self.xi<<guard),
 			y[0].eq(self.yi<<guard),
@ -75,167 +206,143 @@ class TwoQuadrantCordic(Module):
 			self.zo.eq(z[-1]>>guard),
 			]

-		if eval_mode in ("combinatorial", "pipelined"):
-			self.comb += self.fresh.eq(1)
-			for i in range(stages):
-				exec_target += self.stage(x[i], y[i], z[i],
-						x[i + 1], y[i + 1], z[i + 1], i, a[i])
-		elif eval_mode == "iterative":
-			# we afford one additional iteration for register in/out
-			# shifting, trades muxes for registers
+		if eval_mode == "iterative":
+			# We afford one additional iteration for in/out.
 			i = Signal(max=stages + 1)
-			ai = Signal((width+guard, True))
-			self.comb += ai.eq(Array(a)[i])
-			exec_target += [
+			self.comb += [
+					self.new_in.eq(i == stages),
+					self.new_out.eq(i == 1),
+					]
+			ai = Signal((widthz + guard, True))
+			self.sync += ai.eq(Array(a)[i])
+			if range(stages) == s:
+				si = i - 1 # shortcut if no stage repetitions
+			else:
+				si = Signal(max=stages + 1)
+				self.sync += si.eq(Array(s)[i])
+			xi, yi, zi = x[1], y[1], z[1]
+			self.sync += [
+					self._stage(xi, yi, zi, xi, yi, zi, si, ai),
 					i.eq(i + 1),
 					If(i == stages,
 						i.eq(0),
-						self.fresh.eq(1),
-						Cat(x[1], y[1], z[1]).eq(Cat(x[0], y[0], z[0])),
-						Cat(x[2], y[2], z[2]).eq(Cat(x[1], y[1], z[1])),
-					).Else(
-						self.fresh.eq(0),
-						# in-place stages
-						self.stage(x[1], y[1], z[1], x[1], y[1], z[1], i, ai),
+					),
+					If(i == 0,
+						x[2].eq(xi),
+						y[2].eq(yi),
+						z[2].eq(zi),
+						xi.eq(x[0]),
+						yi.eq(y[0]),
+						zi.eq(z[0]),
 					)]
+		else:
+			self.comb += [
+					self.new_out.eq(1),
+					self.new_in.eq(1),
+					]
+			for i, si in enumerate(s):
+				stmt = self._stage(x[i], y[i], z[i],
+						x[i + 1], y[i + 1], z[i + 1], si, a[i])
+				if eval_mode == "pipelined":
+					self.sync += stmt
+				else: # combinatorial
+					self.comb += stmt

-	def stage(self, xi, yi, zi, xo, yo, zo, i, a):
-		"""
-		x_{i+1} = x_{i} - m*d_i*y_i*r**(-s_{m,i})
-		y_{i+1} = d_i*x_i*r**(-s_{m,i}) + y_i
-		z_{i+1} = z_i - d_i*a_{m,i}
+	def _constants(self, stages, bits):
+		if self.func_mode == "circular":
+			s = range(stages)
+			a = [atan(2**-i) for i in s]
+			g = [sqrt(1 + 2**(-2*i)) for i in s]
+			#zmax = sum(a)
+			# use pi anyway as the input z can cause overflow
+			# and we need the range for quadrant mapping
+			zmax = pi
+		elif self.func_mode == "linear":
+			s = range(stages)
+			a = [2**-i for i in s]
+			g = [1 for i in s]
+			#zmax = sum(a)
+			# use 2 anyway as this simplifies a and scaling
+			zmax = 2.
+		else: # hyperbolic
+			s = []
+			# need to repeat some stages:
+			j = 4
+			for i in range(stages):
+				if i == j:
+					s.append(j)
+					j = 3*j + 1
+				s.append(i + 1)
+			a = [atanh(2**-i) for i in s]
+			g = [sqrt(1 - 2**(-2*i)) for i in s]
+			zmax = sum(a)*2
+		a = [int(ai*2**(bits - 1)/zmax) for ai in a]
+		# round here helps the width=2**i - 1 case but hurts the
+		# important width=2**i case
+		gain = 1.
+		for gi in g:
+			gain *= gi
+		return a, s, zmax, gain

-		d_i: clockwise or counterclockwise
-		r: radix of the number system
-		m: 1: circular, 0: linear, -1: hyperbolic
-		s_{m,i}: non decreasing integer shift sequence
-		a_{m,i}: elemetary rotation angle
-		"""
-		dx, dy, dz = xi>>i, yi>>i, a
-		direction = {"rotate": zi < 0, "vector": yi >= 0}[self.cordic_mode]
-		dy = {"circular": dy, "linear": 0, "hyperbolic": -dy}[self.func_mode]
-		ret = If(direction,
-					xo.eq(xi + dy),
-					yo.eq(yi - dx),
+	def _stage(self, xi, yi, zi, xo, yo, zo, i, ai):
+		if self.cordic_mode == "rotate":
+			direction = zi < 0
+		else: # vector
+			direction = yi >= 0
+		dx = yi>>i
+		dy = xi>>i
+		dz = ai
+		if self.func_mode == "linear":
+			dx = 0
+		elif self.func_mode == "hyperbolic":
+			dx = -dx
+		stmt = If(direction,
+					xo.eq(xi + dx),
+					yo.eq(yi - dy),
 					zo.eq(zi + dz),
 				).Else(
-					xo.eq(xi - dy),
-					yo.eq(yi + dx),
+					xo.eq(xi - dx),
+					yo.eq(yi + dy),
 					zo.eq(zi - dz),
 				)
-		return ret
+		return stmt

 class Cordic(TwoQuadrantCordic):
+	"""Four-quadrant CORDIC
+
+	Same as :class:`TwoQuadrantCordic` but with support and convergence
+	for `abs(zi) > pi/2 in circular rotate mode or `xi < 0` in circular
+	vector mode.
+	"""
 	def __init__(self, **kwargs):
 		TwoQuadrantCordic.__init__(self, **kwargs)
-		if not (self.func_mode, self.cordic_mode) == ("circular", "rotate"):
+		if self.func_mode != "circular":
 			return # no need to remap quadrants
-		cxi, cyi, czi, cxo, cyo, czo = (self.xi, self.yi, self.zi,
-				self.xo, self.yo, self.zo)
+
 		width = flen(self.xi)
-		for l in "xyz":
-			for d in "io":
-				setattr(self, l+d, Signal((width, True), l+d))
-		qin = Signal()
-		qout = Signal()
-		if self.latency == 0:
-			self.comb += qout.eq(qin)
-		elif self.latency == 1:
-			self.sync += qout.eq(qin)
-		else:
-			sr = Signal(self.latency-1)
-			self.sync += Cat(sr, qout).eq(Cat(qin, sr))
-		pi2 = (1<<(width-2))-1
-		self.zmax *= 2
+		widthz = flen(self.zi)
+		cxi, cyi, czi = self.xi, self.yi, self.zi
+		self.xi = Signal((width, True))
+		self.yi = Signal((width, True))
+		self.zi = Signal((widthz, True))
+
+		###
+
+		pi2 = 1<<(widthz - 2)
+		if self.cordic_mode == "rotate":
+			#rot = self.zi + pi2 < 0
+			rot = self.zi[-1] ^ self.zi[-2]
+		else: # vector
+			rot = self.xi < 0
+			#rot = self.xi[-1]
 		self.comb += [
-				# zi, zo are scaled to cover the range, this also takes
-				# care of mapping the zi quadrants
-				Cat(cxi, cyi, czi).eq(Cat(self.xi, self.yi, self.zi<<1)),
-				Cat(self.xo, self.yo, self.zo).eq(Cat(cxo, cyo, czo>>1)),
-				# shift in the (2,3)-quadrant flag
-				qin.eq((-self.zi < -pi2) | (self.zi+1 < -pi2)),
-				# need to remap xo/yo quadrants (2,3) -> (4,1)
-				If(qout,
-					self.xo.eq(-cxo),
-					self.yo.eq(-cyo),
-				)]
-
-class TB(Module):
-	def __init__(self, n, **kwargs):
-		self.submodules.cordic = Cordic(**kwargs)
-		self.xi = [.9/self.cordic.gain] * n
-		self.yi = [0] * n
-		self.zi = [2*i/n-1 for i in range(n)]
-		self.xo = []
-		self.yo = []
-		self.zo = []
-
-	def do_simulation(self, s):
-		c = 2**(flen(self.cordic.xi)-1)
-		if s.rd(self.cordic.fresh):
-			self.xo.append(s.rd(self.cordic.xo))
-			self.yo.append(s.rd(self.cordic.yo))
-			self.zo.append(s.rd(self.cordic.zo))
-			if not self.xi:
-				s.interrupt = True
-				return
-			for r, v in zip((self.cordic.xi, self.cordic.yi, self.cordic.zi),
-					(self.xi, self.yi, self.zi)):
-				s.wr(r, int(v.pop(0)*c))
-
-def _main():
-	from migen.fhdl import verilog
-	from migen.sim.generic import Simulator, TopLevel
-	from matplotlib import pyplot as plt
-	import numpy as np
-
-	c = Cordic(width=16, eval_mode="iterative",
-		cordic_mode="rotate", func_mode="circular")
-	print(verilog.convert(c, ios={c.xi, c.yi, c.zi, c.xo,
-		c.yo, c.zo}))
-
-	n = 200
-	tb = TB(n, width=8, guard=3, eval_mode="pipelined",
-			cordic_mode="rotate", func_mode="circular")
-	sim = Simulator(tb, TopLevel("cordic.vcd"))
-	sim.run(n*16+20)
-	plt.plot(tb.xo)
-	plt.plot(tb.yo)
-	plt.plot(tb.zo)
-	plt.show()
-
-def _rms_err(width, stages, n):
-	from migen.sim.generic import Simulator
-	import numpy as np
-	import matplotlib.pyplot as plt
-
-	tb = TB(width=int(width), stages=int(stages), n=n,
-			eval_mode="combinatorial")
-	sim = Simulator(tb)
-	sim.run(n+100)
-	z = tb.cordic.zmax*(np.arange(n)/n*2-1)
-	x = np.cos(z)*.9
-	y = np.sin(z)*.9
-	dx = tb.xo[1:]-x*2**(width-1)
-	dy = tb.yo[1:]-y*2**(width-1)
-	return ((dx**2+dy**2)**.5).sum()/n
-
-def _test_err():
-	from matplotlib import pyplot as plt
-	import numpy as np
-
-	widths, stages = np.mgrid[4:33:1, 4:33:1]
-	err = np.vectorize(lambda w, s: rms_err(w, s, 173))(widths, stages)
-	err = -np.log2(err)/widths
-	print(err)
-	plt.contour(widths, stages, err, 50, cmap=plt.cm.Greys)
-	plt.plot(widths[:, 0], stages[0, np.argmax(err, 1)], "bx-")
-	print(widths[:, 0], stages[0, np.argmax(err, 1)])
-	plt.colorbar()
-	plt.grid("on")
-	plt.show()
-
-if __name__ == "__main__":
-	_main()
-	#_rms_err(16, 16, 345)
-	#_test_err()
+				cxi.eq(self.xi),
+				cyi.eq(self.yi),
+				czi.eq(self.zi),
+				If(rot,
+					cxi.eq(-self.xi),
+					cyi.eq(-self.yi),
+					czi.eq(self.zi + 2*pi2),
+					#czi.eq(self.zi ^ (2*pi2)),
+				),
+				]
--- a/migen/test/test_cordic.py
+++ b/migen/test/test_cordic.py
@ -0,0 +1,163 @@
+import unittest
+from random import randrange, random
+from math import *
+
+from migen.fhdl.std import *
+from migen.genlib.cordic import *
+
+from migen.test.support import SimCase, SimBench
+
+
+class CordicCase(SimCase, unittest.TestCase):
+	class TestBench(SimBench):
+		def __init__(self, **kwargs):
+			k = dict(width=8, guard=None, stages=None,
+				eval_mode="combinatorial", cordic_mode="rotate",
+				func_mode="circular")
+			k.update(kwargs)
+			self.submodules.dut = Cordic(**k)
+
+	def _run_io(self, n, gen, proc, delta=1, deltaz=1):
+		c = 2**(flen(self.tb.dut.xi) - 1)
+		g = self.tb.dut.gain
+		zm = self.tb.dut.zmax
+		pipe = {}
+		genn = [gen() for i in range(n)]
+		def cb(tb, s):
+			if s.rd(tb.dut.new_in):
+				if genn:
+					xi, yi, zi = genn.pop(0)
+				else:
+					s.interrupt = True
+					return
+				xi = floor(xi*c/g)
+				yi = floor(yi*c/g)
+				zi = floor(zi*c/zm)
+				s.wr(tb.dut.xi, xi)
+				s.wr(tb.dut.yi, yi)
+				s.wr(tb.dut.zi, zi)
+				pipe[s.cycle_counter] = xi, yi, zi
+			if s.rd(tb.dut.new_out):
+				t = s.cycle_counter - tb.dut.latency - 1
+				if t < 1:
+					return
+				xi, yi, zi = pipe.pop(t)
+				xo, yo, zo = proc(xi/c, yi/c, zi/c*zm)
+				xo = floor(xo*c*g)
+				yo = floor(yo*c*g)
+				zo = floor(zo*c/zm)
+				xo1 = s.rd(tb.dut.xo)
+				yo1 = s.rd(tb.dut.yo)
+				zo1 = s.rd(tb.dut.zo)
+				print((xi, yi, zi), (xo, yo, zo), (xo1, yo1, zo1))
+				self.assertAlmostEqual(xo, xo1, delta=delta)
+				self.assertAlmostEqual(yo, yo1, delta=delta)
+				self.assertAlmostEqual(abs(zo - zo1) % (2*c), 0, delta=deltaz)
+		self.run_with(cb)
+
+	def test_rot_circ(self):
+		def gen():
+			ti = 2*pi*random()
+			r = random()*.98
+			return r*cos(ti), r*sin(ti), (2*random() - 1)*pi
+		def proc(xi, yi, zi):
+			xo = cos(zi)*xi - sin(zi)*yi
+			yo = sin(zi)*xi + cos(zi)*yi
+			return xo, yo, 0
+		self._run_io(50, gen, proc, delta=2)
+
+	def test_rot_circ_16(self):
+		self.setUp(width=16)
+		self.test_rot_circ()
+
+	def test_rot_circ_pipe(self):
+		self.setUp(eval_mode="pipelined")
+		self.test_rot_circ()
+
+	def test_rot_circ_iter(self):
+		self.setUp(eval_mode="iterative")
+		self.test_rot_circ()
+
+	def _test_vec_circ(self):
+		def gen():
+			ti = pi*(2*random() - 1)
+			r = .98 #*random()
+			return r*cos(ti), r*sin(ti), 0 #pi*(2*random() - 1)
+		def proc(xi, yi, zi):
+			return sqrt(xi**2 + yi**2), 0, zi + atan2(yi, xi)
+		self._run_io(50, gen, proc)
+
+	def test_vec_circ(self):
+		self.setUp(cordic_mode="vector")
+		self._test_vec_circ()
+
+	def test_vec_circ_16(self):
+		self.setUp(width=16, cordic_mode="vector")
+		self._test_vec_circ()
+
+	def _test_rot_hyp(self):
+		def gen():
+			return .6, 0, 2.1*(random() - .5)
+		def proc(xi, yi, zi):
+			xo = cosh(zi)*xi - sinh(zi)*yi
+			yo = sinh(zi)*xi + cosh(zi)*yi
+			return xo, yo, 0
+		self._run_io(50, gen, proc, delta=2)
+
+	def test_rot_hyp(self):
+		self.setUp(func_mode="hyperbolic")
+		self._test_rot_hyp()
+
+	def test_rot_hyp_16(self):
+		self.setUp(func_mode="hyperbolic", width=16)
+		self._test_rot_hyp()
+
+	def test_rot_hyp_iter(self):
+		self.setUp(cordic_mode="rotate", func_mode="hyperbolic",
+				eval_mode="iterative")
+		self._test_rot_hyp()
+
+	def _test_vec_hyp(self):
+		def gen():
+			xi = random()*.6 + .2
+			yi = random()*xi*.8
+			return xi, yi, 0
+		def proc(xi, yi, zi):
+			return sqrt(xi**2 - yi**2), 0, atanh(yi/xi)
+		self._run_io(50, gen, proc)
+
+	def test_vec_hyp(self):
+		self.setUp(cordic_mode="vector", func_mode="hyperbolic")
+		self._test_vec_hyp()
+
+	def _test_rot_lin(self):
+		def gen():
+			xi = 2*random() - 1
+			if abs(xi) < .01:
+				xi = .01
+			yi = (2*random() - 1)*.5
+			zi = (2*random() - 1)*.5
+			return xi, yi, zi
+		def proc(xi, yi, zi):
+			return xi, yi + xi*zi, 0
+		self._run_io(50, gen, proc)
+
+	def test_rot_lin(self):
+		self.setUp(func_mode="linear")
+		self._test_rot_lin()
+
+	def _test_vec_lin(self):
+		def gen():
+			yi = random()*.95 + .05
+			if random() > 0:
+				yi *= -1
+			xi = abs(yi) + random()*(1 - abs(yi))
+			zi = 2*random() - 1
+			return xi, yi, zi
+		def proc(xi, yi, zi):
+			return xi, 0, zi + yi/xi
+		self._run_io(50, gen, proc, deltaz=2, delta=2)
+
+	def test_vec_lin(self):
+		self.setUp(func_mode="linear", cordic_mode="vector", width=8)
+		self._test_vec_lin()