From 3dd29b100483cb9eb34d96f6ed21827e86cd77a2 Mon Sep 17 00:00:00 2001
From: Peter McGoron <code@mcgoron.com>
Date: Mon, 20 Feb 2023 18:45:21 +0000
Subject: [PATCH] add label parser that can deal with forward jumps; all tests
 pass

---
 asm/creole.py | 152 +++++++++++++++++++++++++++++++++-----------------
 asm/test.py   |  98 ++++++++++++--------------------
 2 files changed, 138 insertions(+), 112 deletions(-)

diff --git a/asm/creole.py b/asm/creole.py
index 559d663..a614701 100644
--- a/asm/creole.py
+++ b/asm/creole.py
@@ -71,7 +71,7 @@ class Argument:
 
 class StringArgument(Argument):
 	def __init__(self, *args, **kwargs):
-		super().__init__(self, *args, **kwargs)
+		super().__init__(*args, **kwargs)
 	def __bytes__(self):
 		b = bytes()
 		for v in self.val:
@@ -80,9 +80,23 @@ class StringArgument(Argument):
 
 class LabelArgument(Argument):
 	def __init__(self, *args, **kwargs):
-		super().__init__(self, *args, **kwargs)
-	def load_label(self, labels):
-		self.val = labels[val]
+		super().__init__(*args, **kwargs)
+	def __call__(self):
+		return self.val
+
+class TypecheckException(Exception):
+	""" Exception thrown when an argument to an instruction is of the
+	    incorrect type. """
+	def __init__(self, got, argtype, sarg, opcode, i):
+		self.argtype = argtype
+		self.sarg = sarg
+		self.opcode = opcode
+		self.got = got
+		self.i = i
+		self.message = f'''\
+opcode {self.opcode.name} has invalid value {self.sarg} ({self.got})
+at {self.i} (expected {self.argtype})\
+'''
 
 class ArgType(Enum):
 	""" Class denoting the type of an argument to an instruction. """
@@ -135,27 +149,29 @@ class ArgType(Enum):
 		elif s[0] == 'd' and s[1:].isnumeric():
 			return Argument(ArgType.DAT, int(s[1:]))
 		elif s[0] == '.':
-			return Argument(ArgType.LAB, s[1:])
+			return LabelArgument(ArgType.LAB, s[:])
 		else:
 			raise MalformedArgument(s)
 
-	def typecheck(self, s):
+	def typecheck(self, s, opcode, i):
 		""" Parses the type of the string and returns it if it fits
 		    the type of the enum value.
 
 		    :param s: String argument representing an argument.
-		    :return: The Argument class containing the object, or None
-		    if the string does not fit the type of self. """
+		    :param opcode: Instruction being checked. Used in error reports.
+		    :param i: Zero-based argument position. Used in error reports.
+		    :return: The Argument class containing the object.
+		    :raises TypecheckException: """
 		t = ArgType.gettype(s)
 		if self == ArgType.VAL:
 			if t.at == ArgType.REG or t.at == ArgType.IMM:
 				return t
 			else:
-				return None
+				raise TypecheckException(t.at, self, s, opcode, i)
 		elif t.at == self:
 			return t
 		else:
-			return None
+			raise TypecheckException(t.at, self, s, opcode, i)
 
 class OpcodeException(Exception):
 	pass
@@ -168,20 +184,7 @@ class TypecheckLenException(Exception):
 		self.argtypelen = argtypelen
 	def __str__(self):
 		return f'''\
-arguments {self.insargs} to opcode {self.opcode} not of length {self.argtypelen}\
-'''
-class TypecheckException(Exception):
-	""" Exception thrown when an argument to an instruction are of the
-	    incorrect type. """
-	def __init__(self, argtype, sarg, i, opcode):
-		self.argtype = argtype
-		self.sarg = sarg
-		self.i = i
-		self.opcode = opcode
-	def __str__(self):
-		return f'''\
-opcode {self.opcode} has invalid value {self.sarg}
-(expected {self.argtype} in position {self.i})\
+arguments {self.insargs} to opcode {self.opcode.name} not of length {self.argtypelen}\
 '''
 
 class Instruction(Enum):
@@ -248,19 +251,14 @@ class Instruction(Enum):
 		    :param sargs: List of arguments to the instruction
 		    as strings.
 		    :return: List of arguments (as Argument objects).
-		    :raises TypeCheckException:
 		    :raises TypecheckLenException:
 		"""
 		rargs = []
 		if len(sargs) != len(self.argtypes):
-			raise TypecheckLenException(self.opcode, sargs,
+			raise TypecheckLenException(self, sargs,
 			                            len(self.argtypes))
 		for i in range(0, len(sargs)):
-			t = self.argtypes[i].typecheck(sargs[i])
-			if t is None:
-				raise TypecheckException(self.argtypes[i],
-				                         sargs[i],
-				                         i, self.opcode)
+			t = self.argtypes[i].typecheck(sargs[i], self, i)
 			rargs.append(t)
 		return rargs
 
@@ -280,10 +278,11 @@ class Instruction(Enum):
 		return Instruction[self.opcode].render(args)
 
 	def _render_default(self, args):
-		b = bytes([self.opcode])
+		comps = [bytes([self.opcode])]
 		for a in args:
-			b = b + a()
-		return b + bytes([0])
+			comps.append(a())
+		comps.append(b'\x00')
+		return comps
 
 encoding_types = {
 #           start      mask   B
@@ -296,6 +295,12 @@ encoding_types = {
 #	B : Total number of bits excluding high bits
 }
 
+def pseudo_utf8_len(n):
+	for k in sorted(encoding_types):
+		if n <= encoding_types[k][0]:
+			return k
+	return None
+
 class InvalidNumberException(Exception):
 	pass
 class InvalidLengthException(Exception):
@@ -304,10 +309,7 @@ def encode_pseudo_utf8(n, high_bits, to):
 	if n < 0:
 		raise InvalidNumberException(n)
 	if to is None or to < 0:
-		for k in sorted(encoding_types):
-			if n <= encoding_types[k][0]:
-				to = k
-				break
+		to = pseudo_utf8_len(n)
 		if to is None:
 			raise InvalidNumberException(n)
 	if to > 8 or to < 0:
@@ -343,27 +345,26 @@ class Line:
 					raise RangeCheckException(a.at,
 					                          a.val,
 					                          reglen)
-
-	def load_label(self, labels):
-		for a in self.args:
-			if a.at == ArgType.LAB:
-				a.load_label(labels)
 	def __call__(self):
 		return self.ins.render(self.args)
 
 class InstructionNotFoundException(Exception):
 	pass
 class Program:
-	def asm_push_line(self, ins, args):
+	def _asm_push_line(self, ins, args):
 		l = Line(ins, args)
 		l.check_line(self.lablen, self.reglen)
 		self.asm.append(l)
 
 	def parse_asm_line(self, line):
+		""" Parse and add a single assembly line to the program.
+		:param line: String containing the line.
+		:raises InstructionNotFoundException:
+		"""
 		line = line.strip().split()
 		line[0] = line[0].casefold()
 		if line[0][0] == '.':
-			self.asm.append(line[0][1:])
+			self.asm.append(line[0])
 			return None
 
 		try:
@@ -372,21 +373,72 @@ class Program:
 			raise InstructionNotFoundException(line[0])
 
 		args_w_type = ins.typecheck(line[1:])
-		self.asm_push_line(ins, args_w_type)
+		self._asm_push_line(ins, args_w_type)
 
 	def parse_lines(self, lines):
+		""" Parse a list of lines. See parse_asm_line.
+		    :param lines: List of assembly lines.
+		"""
 		for l in lines:
 			self.parse_asm_line(l)
 
 	def __call__(self):
-		b = bytes()
+		""" Generate bytecode. """
+
+		# Labels may jump forward in the program, which means
+		# multiple passes are required to properly calculate
+		# jump locations.
+		# This algorithm makes every jump destination the same
+		# width in each operation, and calculates the smallest
+		# width that will allow all labels to jump to any location
+		# in the program.
+		# The algorithm calculates the length of the program
+		# with all jump arguments given a length of 0. Each label
+		# is noted with its offset in the program (with all jump
+		# arguments given zero length) and the number of jump arguments
+		# that occur prior to the label.
+		# When the code is emitted, each label's final offset is
+		# calculated using the width chosen for jump arguments.
+		# This method is not optimal, but will work well for small
+		# programs.
+
+		ins = []
+		curlen = 0
+
+		# This dictionary maps each label to a tuple (len, refs),
+		# meaning that the label points to len + lablen*refs,
+		# where lablen is a to-be-determined number.
 		labels = {}
+		labelrefs = 0
 		for line in self.asm:
 			if type(line) is str:
-				labels[line] = len(b)
+				labels[line] = (curlen, labelrefs)
 				continue
-			line.load_label(labels)
-			b = b + line()
+
+			next_ins = line()
+			for v in next_ins:
+				if type(v) is str:
+					labelrefs += 1
+				else:
+					curlen += len(v)
+			ins.append(next_ins)
+
+		# Calculate a label length, such that the entire program
+		# can be contained in this length.
+		for i in encoding_types:
+			if curlen + labelrefs*i < encoding_types[i][0]:
+				lablen = i
+				break
+
+		# Emit bytecode.
+		b = bytes()
+		for line in ins:
+			for arg in line:
+				if type(arg) is str:
+					off = labels[arg][0] + labels[arg][1]*lablen
+					arg = encode_pseudo_utf8(off, 0, lablen)
+				b = b + arg
+		assert len(b) < encoding_types[lablen][0]
 		return b
 
 	def __init__(self, lablen=16, reglen=16):
diff --git a/asm/test.py b/asm/test.py
index f2c2277..cae3d36 100644
--- a/asm/test.py
+++ b/asm/test.py
@@ -30,17 +30,17 @@ class PushTest(unittest.TestCase):
 	def test_parse_push_catch_typecheck_push_lab(self):
 		p = Program()
 		with self.assertRaises(TypecheckException) as cm:
-			p.parse_asm_line("push l0")
+			p.parse_asm_line("push .l0")
 		self.assertEqual(cm.exception.argtype, ArgType.VAL)
-		self.assertEqual(cm.exception.sarg, 'l0')
+		self.assertEqual(cm.exception.sarg, '.l0')
 		self.assertEqual(cm.exception.i, 0)
-		self.assertEqual(cm.exception.opcode, 1)
+		self.assertEqual(cm.exception.opcode, Instruction.PUSH)
 
 	def test_parse_push_catch_typecheck_argument_overflow(self):
 		p = Program()
 		with self.assertRaises(TypecheckLenException) as cm:
 			p.parse_asm_line("push r1 r2")
-		self.assertEqual(cm.exception.opcode, 1)
+		self.assertEqual(cm.exception.opcode, Instruction.PUSH)
 		self.assertEqual(cm.exception.insargs, ["r1", "r2"])
 		self.assertEqual(cm.exception.argtypelen, 1)
 
@@ -48,7 +48,7 @@ class PushTest(unittest.TestCase):
 		p = Program()
 		with self.assertRaises(TypecheckLenException) as cm:
 			p.parse_asm_line("push")
-		self.assertEqual(cm.exception.opcode, 1)
+		self.assertEqual(cm.exception.opcode, Instruction.PUSH)
 		self.assertEqual(cm.exception.insargs, [])
 		self.assertEqual(cm.exception.argtypelen, 1)
 
@@ -64,23 +64,6 @@ class PushTest(unittest.TestCase):
 		p.parse_asm_line("PUSH 6")
 
 		ex = ffi.Environment(p())
-		self.assertEqual(ex.cenv.prgend, 2)
-
-		self.assertEqual(ex.cenv.prg[0].opcode, 1)
-		self.assertEqual(ex.cenv.prg[0].w_flags[0], 1)
-		self.assertEqual(ex.cenv.prg[0].w_flags[1], 0)
-		self.assertEqual(ex.cenv.prg[0].w_flags[2], 0)
-		self.assertEqual(ex.cenv.prg[0].w[0], 0)
-		self.assertEqual(ex.cenv.prg[0].w[1], 0)
-		self.assertEqual(ex.cenv.prg[0].w[2], 0)
-
-		self.assertEqual(ex.cenv.prg[1].opcode, 1)
-		self.assertEqual(ex.cenv.prg[1].w_flags[0], 0)
-		self.assertEqual(ex.cenv.prg[1].w_flags[1], 0)
-		self.assertEqual(ex.cenv.prg[1].w_flags[2], 0)
-		self.assertEqual(ex.cenv.prg[1].w[0], 6)
-		self.assertEqual(ex.cenv.prg[1].w[1], 0)
-		self.assertEqual(ex.cenv.prg[1].w[2], 0)
 
 	def test_push_many(self):
 		p = Program()
@@ -107,15 +90,6 @@ class PopTest(unittest.TestCase):
 		b = p()
 		self.assertEqual(b, b'\x02\xC2\x89\x00')
 		ex = ffi.Environment(b)
-		self.assertEqual(ex.cenv.prgend, 1)
-
-		self.assertEqual(ex.cenv.prg[0].opcode, 2)
-		self.assertEqual(ex.cenv.prg[0].w_flags[0], 1)
-		self.assertEqual(ex.cenv.prg[0].w_flags[1], 0)
-		self.assertEqual(ex.cenv.prg[0].w_flags[2], 0)
-		self.assertEqual(ex.cenv.prg[0].w[0], 9)
-		self.assertEqual(ex.cenv.prg[0].w[1], 0)
-		self.assertEqual(ex.cenv.prg[0].w[2], 0)
 
 	def test_compile_throw_pop_literal(self):
 		p = Program()
@@ -124,22 +98,22 @@ class PopTest(unittest.TestCase):
 		self.assertEqual(cm.exception.argtype, ArgType.REG)
 		self.assertEqual(cm.exception.sarg, '6')
 		self.assertEqual(cm.exception.i, 0)
-		self.assertEqual(cm.exception.opcode, 2)
+		self.assertEqual(cm.exception.opcode, Instruction.POP)
 
 	def test_compile_throw_pop_label(self):
 		p = Program()
 		with self.assertRaises(TypecheckException) as cm:
-			p.parse_asm_line("pop l9")
+			p.parse_asm_line("pop .l9")
 		self.assertEqual(cm.exception.argtype, ArgType.REG)
-		self.assertEqual(cm.exception.sarg, 'l9')
+		self.assertEqual(cm.exception.sarg, '.l9')
 		self.assertEqual(cm.exception.i, 0)
-		self.assertEqual(cm.exception.opcode, 2)
+		self.assertEqual(cm.exception.opcode, Instruction.POP)
 
 	def test_compile_throw_argument_overflow(self):
 		p = Program()
 		with self.assertRaises(TypecheckLenException) as cm:
 			p.parse_asm_line("pop r1 r2")
-		self.assertEqual(cm.exception.opcode, 2)
+		self.assertEqual(cm.exception.opcode, Instruction.POP)
 		self.assertEqual(cm.exception.insargs, ["r1", "r2"])
 		self.assertEqual(cm.exception.argtypelen, 1)
 
@@ -147,7 +121,7 @@ class PopTest(unittest.TestCase):
 		p = Program()
 		with self.assertRaises(TypecheckLenException) as cm:
 			p.parse_asm_line("pop")
-		self.assertEqual(cm.exception.opcode, 2)
+		self.assertEqual(cm.exception.opcode, Instruction.POP)
 		self.assertEqual(cm.exception.insargs, [])
 
 	def test_pop_underflow(self):
@@ -189,7 +163,7 @@ class AddTest(unittest.TestCase):
 		self.assertEqual(cm.exception.argtype, ArgType.REG)
 		self.assertEqual(cm.exception.sarg, '5')
 		self.assertEqual(cm.exception.i, 0)
-		self.assertEqual(cm.exception.opcode, 3)
+		self.assertEqual(cm.exception.opcode, Instruction.ADD)
 
 	def test_exec_add_throw_lab_1(self):
 		p = Program()
@@ -198,7 +172,7 @@ class AddTest(unittest.TestCase):
 		self.assertEqual(cm.exception.argtype, ArgType.VAL)
 		self.assertEqual(cm.exception.sarg, '.label')
 		self.assertEqual(cm.exception.i, 1)
-		self.assertEqual(cm.exception.opcode, 3)
+		self.assertEqual(cm.exception.opcode, Instruction.ADD)
 
 	def test_exec_add_throw_lab_2(self):
 		p = Program()
@@ -207,7 +181,7 @@ class AddTest(unittest.TestCase):
 		self.assertEqual(cm.exception.argtype, ArgType.VAL)
 		self.assertEqual(cm.exception.sarg, '.ab')
 		self.assertEqual(cm.exception.i, 2)
-		self.assertEqual(cm.exception.opcode, 3)
+		self.assertEqual(cm.exception.opcode, Instruction.ADD)
 
 class MulTest(unittest.TestCase):
 	def test_exec_mul_imm_imm(self):
@@ -233,25 +207,25 @@ class MulTest(unittest.TestCase):
 		self.assertEqual(cm.exception.argtype, ArgType.REG)
 		self.assertEqual(cm.exception.sarg, '942')
 		self.assertEqual(cm.exception.i, 0)
-		self.assertEqual(cm.exception.opcode, 4)
+		self.assertEqual(cm.exception.opcode, Instruction.MUL)
 
 	def test_exec_mul_throw_lab_1(self):
 		p = Program()
 		with self.assertRaises(TypecheckException) as cm:
-			p.parse_asm_line("mul r9 l2 1991")
+			p.parse_asm_line("mul r9 .l2 1991")
 		self.assertEqual(cm.exception.argtype, ArgType.VAL)
-		self.assertEqual(cm.exception.sarg, 'l2')
+		self.assertEqual(cm.exception.sarg, '.l2')
 		self.assertEqual(cm.exception.i, 1)
-		self.assertEqual(cm.exception.opcode, 4)
+		self.assertEqual(cm.exception.opcode, Instruction.MUL)
 
 	def test_exec_mul_throw_lab_2(self):
 		p = Program()
 		with self.assertRaises(TypecheckException) as cm:
-			p.parse_asm_line("mul r0 -11 l48")
+			p.parse_asm_line("mul r0 -11 .l48")
 		self.assertEqual(cm.exception.argtype, ArgType.VAL)
-		self.assertEqual(cm.exception.sarg, 'l48')
+		self.assertEqual(cm.exception.sarg, '.l48')
 		self.assertEqual(cm.exception.i, 2)
-		self.assertEqual(cm.exception.opcode, 4)
+		self.assertEqual(cm.exception.opcode, Instruction.MUL)
 
 class DivTest(unittest.TestCase):
 	def test_div(self):
@@ -295,7 +269,7 @@ class DivTest(unittest.TestCase):
 		self.assertEqual(cm.exception.argtype, ArgType.REG)
 		self.assertEqual(cm.exception.sarg, '5')
 		self.assertEqual(cm.exception.i, 0)
-		self.assertEqual(cm.exception.opcode, 5)
+		self.assertEqual(cm.exception.opcode, Instruction.DIV)
 
 	def test_exec_div_throw_lab_1(self):
 		p = Program()
@@ -304,7 +278,7 @@ class DivTest(unittest.TestCase):
 		self.assertEqual(cm.exception.argtype, ArgType.VAL)
 		self.assertEqual(cm.exception.sarg, '.qqweq')
 		self.assertEqual(cm.exception.i, 1)
-		self.assertEqual(cm.exception.opcode, 5)
+		self.assertEqual(cm.exception.opcode, Instruction.DIV)
 
 	def test_exec_div_throw_lab_2(self):
 		p = Program()
@@ -313,7 +287,7 @@ class DivTest(unittest.TestCase):
 		self.assertEqual(cm.exception.argtype, ArgType.VAL)
 		self.assertEqual(cm.exception.sarg, '.24')
 		self.assertEqual(cm.exception.i, 2)
-		self.assertEqual(cm.exception.opcode, 5)
+		self.assertEqual(cm.exception.opcode, Instruction.DIV)
 
 class LabelTest(unittest.TestCase):
 	def test_unconditional_jump(self):
@@ -321,9 +295,9 @@ class LabelTest(unittest.TestCase):
 		p.parse_lines([
 			"mov r0 5",
 			"mov r0 6",
-			"j l0",
+			"j .l0",
 			"mov r0 7",
-			"CLB l0",
+			".l0",
 		])
 		ex = ffi.Environment(p())
 		self.assertEqual(ex(), ffi.RunRet.STOP)
@@ -334,10 +308,10 @@ class LabelTest(unittest.TestCase):
 		p.parse_lines([
 			"add r0 10 0",
 			"add r1 20 0",
-			"CLB l0",
+			".loop_head",
 			"add r0 r0 -1",
 			"add r1 r1 1",
-			"jl l0 0 r0"
+			"jl .loop_head 0 r0"
 		])
 		ex = ffi.Environment(p())
 		self.assertEqual(ex(), ffi.RunRet.STOP)
@@ -349,10 +323,10 @@ class LabelTest(unittest.TestCase):
 		p.parse_lines([
 			"mov r0 30",
 			"mov r1 0",
-			"CLB l0",
+			".l0",
 			"add r0 r0 -1",
 			"add r1 r1 1",
-			"jls l0 -30 r0"
+			"jls .l0 -30 r0"
 		])
 		ex = ffi.Environment(p())
 		self.assertEqual(ex(), ffi.RunRet.STOP)
@@ -364,11 +338,11 @@ class LabelTest(unittest.TestCase):
 		p.parse_lines([
 			"mov r0 50",
 			"mov r1 0",
-			"CLB l0",
+			".l0",
 			"add r1 r1 1",
 			"mul r2 r0 -1",
 			"add r2 r2 r1",
-			"jne l0 r2 0"
+			"jne .l0 r2 0"
 		])
 		ex = ffi.Environment(p())
 		self.assertEqual(ex(), ffi.RunRet.STOP)
@@ -381,14 +355,14 @@ class LabelTest(unittest.TestCase):
 		p.parse_lines([
 			"mov r0 0", # outer loop counter
 			"mov r2 0", # total iteration counter
-			"CLB l0",
+			".outer_loop",
 			"mov r1 0", # inner loop counter
-			"CLB l1",
+			".inner_loop",
 			"add r1 r1 1",
 			"add r2 r2 1",
-			"jl l1 r1 50",
+			"jl .inner_loop r1 50",
 			"add r0 r0 1",
-			"jl l0 r0 50"
+			"jl .outer_loop r0 50"
 		])
 		ex = ffi.Environment(p())
 		self.assertEqual(ex(), ffi.RunRet.STOP)