prototype bytecode interpreter

2023-02-05 11:44:37 +00:00 · 2023-02-05 11:44:37 +00:00 · f63d6cdd3d
commit f63d6cdd3d
8 changed files with 1827 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
+test_creole
--- a/LICENSE.md
+++ b/LICENSE.md
@ -0,0 +1,14 @@
+Copyright (c) 2023 Peter McGoron <code@mcgoron.com>
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
--- a/2
+++ b/2
@ -0,0 +1,2 @@
+test_creole: test_creole.c creole.c creole.h greatest.h
+	$(CC) test_creole.c -Wall -pedantic -std=c89 -o test_creole
--- a/README.md
+++ b/README.md
@ -0,0 +1,23 @@
+Creole is a bytecode designed for simple implementations.
+
+## Bytecode Format
+
+Each creole line consists of pseudo-UTF-8 characters. The first byte
+is an unsigned number between 0 and 127 (the high bit is clear). Each
+succeeding pseudo-UTF-8 character is encoded as follows:
+
+* `110xxxxx 10xxxxxx`
+* `1110xxxx 10xxxxxx 10xxxxxx`
+* `11110xxx 10xxxxxx 10xxxxxx 10xxxxxx`
+* `111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx`
+* `1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx`
+* `11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx`
+
+The first four bits determine the type:
+
+* `0`: Value is a register.
+* `1`: Value is immediate.
+
+All other values are reserved. Overlong values are allowed, and for some
+argument values they are necessary. All lines are terminated by a byte
+of all zeros.
--- a/creole.c
+++ b/creole.c
@ -0,0 +1,410 @@
+#include "creole.h"
+
+#include <string.h>
+
+/*************************************************************************
+ * Static information
+ ************************************************************************/
+
+/* Arguments to opcodes can accept the following:
+   * immediate values only (as of now, no values are like this)
+   * register values only (push, pop, etc.)
+   * either values (math operations)
+   * labels (jumps)
+   * none (do not give an argument)
+ */
+enum creole_arg_type {
+	TYPE_NONE, /* argument slot is unused */
+	TYPE_IMM, /* immediate values only */
+	TYPE_REG, /* register values only */
+	TYPE_VAL, /* either an immediate or a register value */
+	TYPE_LAB, /* label, as used by jumps */
+	CREOLE_ARG_TYPE_LEN /* number of argument types; not a type itself */
+};
+
+/* C99+ allows for designating the array index when initializing arrays:
+      [i] = v,
+ * in C89 indices are implicit from 0 to the maximum filled-in value.
+ * The table is indexed by opcode, so the rows below must stay in the
+ * same order as enum creole_opcode. The first argument of defop() is
+ * only a label for the reader; it does not appear in the expansion.
+ */
+#define defop(s, n, a1, a2, a3) {n, {a1, a2, a3}}
+static const struct {
+	unsigned arglen; /* number of arguments the opcode takes */
+	enum creole_arg_type argtype[CREOLE_MAX_ARG]; /* accepted type of each argument */
+} opcode_info[CREOLE_OPCODE_LEN] = {
+	defop(NOOP, 0, TYPE_NONE, TYPE_NONE, TYPE_NONE),
+	defop(PUSH, 1, TYPE_VAL, TYPE_NONE, TYPE_NONE),
+	defop(POP, 1, TYPE_REG, TYPE_NONE, TYPE_NONE),
+	defop(ADD, 3, TYPE_REG, TYPE_VAL, TYPE_VAL),
+	defop(MUL, 3, TYPE_REG, TYPE_VAL, TYPE_VAL),
+	defop(DIV, 3, TYPE_REG, TYPE_VAL, TYPE_VAL),
+	defop(JL, 3, TYPE_LAB, TYPE_VAL, TYPE_VAL),
+	defop(CLB, 1, TYPE_LAB, TYPE_NONE, TYPE_NONE),
+	defop(SYS, 1, TYPE_VAL, TYPE_NONE, TYPE_NONE)
+};
+
+/*************************************************************************
+ * Reading from the buffer
+ ************************************************************************/
+
+/* Consume one byte from the reader.
+ *
+ * Returns the byte value, or -1 once the buffer is exhausted. The
+ * reader is left untouched when -1 is returned.
+ */
+static int read(struct creole_reader *r)
+{
+	int byte;
+
+	if (!r->left)
+		return -1;
+
+	byte = *r->p;
+	r->p += 1;
+	r->left -= 1;
+	return byte;
+}
+
+/* Returns 1 when the reader has no bytes left, 0 otherwise. */
+static int read_eof(struct creole_reader *r)
+{
+	if (r->left != 0)
+		return 0;
+	return 1;
+}
+
+#if 0
+
+/*************************************************************************
+ * Pseudo-UTF-8 lexing
+ *
+ * Pseudo-UTF-8 is based off of UTF-8 but adds more
+ * bytes and allows (requires!) overlong encodings.
+ *
+ * Possible values:
+ *   0xxxxxxx                                              (7 bits)
+ *   110xxxxx 10xxxxxx                                     (11 bits)
+ *   1110xxxx 10xxxxxx 10xxxxxx                            (16 bits)
+ *   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx                   (21 bits)
+ *   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx          (26 bits)
+ *   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (31 bits)
+ *   11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (36 bits)
+ *            10xxxxxx
+ ************************************************************************/
+
+/* A Pseudo-UTF-8 sequence can be either
+ *
+ * * A 1 byte sequence, where the lower 7 bits are the encoded
+ *   word (no high bits).
+ *
+ * * A multi-byte sequence where the 4 MSB are flags, and the
+ *   lower bits are the encoded word.
+ */
+struct word {
+	int len; /* length of the sequence in bytes, start byte included */
+	int high_bits; /* flag bits of a multi-byte sequence */
+	creole_word word; /* the encoded word */
+};
+
+/* Decode a run of plain continuation bytes (10xxxxxx) directly into the
+ * word. Each byte contributes its six low bits, most significant first.
+ * This assumes that none of these continuation bytes carry high-bit
+ * flags.
+ *
+ * r:       reader to consume bytes from.
+ * w:       word being assembled; w->word is shifted and extended.
+ * to_read: number of continuation bytes to consume.
+ *
+ * Returns 1 on success. Returns 0 if the input ends early or a byte
+ * does not start with the 10 continuation prefix.
+ */
+static int read_continue(struct creole_reader *r, struct word *w,
+                         int to_read)
+{
+	int i;
+	int r_ret;
+	unsigned char c;
+
+	for (i = 0; i < to_read; i++) {
+		r_ret = read(r);
+		if (r_ret < 0)
+			return 0;
+		/* Characters might not be 8 bits! */
+		c = (unsigned char)(r_ret & 0xFF);
+		if (c >> 6 != 0x2)
+			return 0;
+		/* 0x3F selects the six payload bits of 10xxxxxx. */
+		w->word = (w->word << 6) | (c & 0x3F);
+	}
+
+	return 1;
+}
+
+/* Start bytes must be treated differently. Depending on the scenario,
+ * start bytes will contain parts of the encoded word and high-bit flags.
+ * In some cases, not all of the high-bit flags are part of the start
+ * byte.
+ *
+ * c: the start byte.
+ * w: receives the sequence length, the word bits and the high-bit flags
+ *    found in the start byte.
+ *
+ * Returns the amount of NORMAL continuation bytes that follow, or -1 if
+ * c is not a valid start byte (a continuation byte or 0xFF).
+ */
+#define START_BYTE_NUM 7
+static int parse_start_byte(unsigned char c, struct word *w)
+{
+	static const struct {
+		/* The amount of 'x's (blank spaces) in the pattern. Both
+		 * the start byte and the mask are shifted right by this
+		 * amount before they are compared. It is stored
+		 * explicitly because the single byte pattern has seven
+		 * blank spaces while sitting at index six.
+		 */
+		int shift;
+		/* The pattern with all blank spaces set to zero. */
+		unsigned char mask;
+		/* The word bits, if they exist, always start from the
+		 * LSB, so there is no need to shift the bits away. The
+		 * word_mask gets the low bits. If there are no bits, set
+		 * to 0.
+		 */
+		unsigned char word_mask;
+
+		/* The high bits may not start from the LSB. There needs
+		 * to be a shift to get the bits to the LSB, and a mask
+		 * to discard the higher bits.
+		 */
+		unsigned char high_bit_mask;
+		int high_bit_shift;
+
+		/* The amount of NORMAL continuation bytes to read.
+		 * This does NOT include continuation bytes that have
+		 * high-bit flags in them.
+		 */
+		int to_read;
+	} start_data[START_BYTE_NUM] = {
+		{0, 0xFE, 0x00, 0x0, 0, 5}, /* 11111110 */
+		{1, 0xFC, 0x00, 0x1, 0, 4}, /* 1111110x */
+		{2, 0xF8, 0x00, 0x3, 0, 3}, /* 111110xx */
+		{3, 0xF0, 0x00, 0x7, 0, 2}, /* 11110xxx */
+		{4, 0xE0, 0x00, 0xF, 0, 2}, /* 1110xxxx */
+		{5, 0xC0, 0x01, 0xF, 1, 1}, /* 110xxxxx */
+		/* The single byte sequence has no high bits. */
+		{7, 0x00, 0x7F, 0x0, 0, 0}  /* 0xxxxxxx */
+	};
+
+	int i;
+
+	for (i = 0; i < START_BYTE_NUM; i++) {
+		int shift = start_data[i].shift;
+
+		if (c >> shift != start_data[i].mask >> shift)
+			continue;
+
+		/* Rows are ordered from the longest sequence (7 bytes)
+		 * down to the single byte sequence.
+		 */
+		w->len = START_BYTE_NUM - i;
+		w->word = c & start_data[i].word_mask;
+		w->high_bits = (c >> start_data[i].high_bit_shift)
+		             & start_data[i].high_bit_mask;
+		return start_data[i].to_read;
+	}
+
+	return -1;
+}
+
+/* This parses the first continuation byte if it is special, i.e. if it
+ * carries the high-bit flags that did not fit into the start byte.
+ *
+ * c: the continuation byte. Its 10 prefix is NOT checked here.
+ * w: word being assembled; w->len must already be set from the start
+ *    byte. The flag bits of c are appended to w->high_bits and the
+ *    remaining low bits of c are appended to w->word.
+ *
+ * Nothing is changed if w->len does not name a sequence length that has
+ * a special continuation byte.
+ */
+#define SPECIAL_CONTINUE_BYTE_NUM (START_BYTE_NUM - 3)
+static void parse_special_byte(unsigned char c, struct word *w)
+{
+	/* The index denotes the amount of high bits that were in
+	 * the start byte. The continue byte supplies the remaining
+	 * (4 - i) high bits, so the stored value must be shifted
+	 * left by that amount.
+	 *
+	 * The amount of bits that must be shifted out in the continue
+	 * byte increase with the index. The amount shifted is (i + 2).
+	 *
+	 * Each value stored in the array is the mask applied after
+	 * shifting the continue byte bits.
+	 */
+	static const unsigned char mask[SPECIAL_CONTINUE_BYTE_NUM] = {
+		0xF, /* 11111110 10HHHHxx */
+		0x7, /* 1111110H 10HHHxxx */
+		0x3, /* 111110HH 10HHxxxx */
+		0x1  /* 11110HHH 10Hxxxxx */
+	};
+	/* A 7 byte sequence has 0 high bits in its start byte, a 6 byte
+	 * sequence has 1, and so on.
+	 */
+	int i = START_BYTE_NUM - w->len;
+	int word_bits = 2 + i;
+
+	if (i < 0 || i >= SPECIAL_CONTINUE_BYTE_NUM)
+		return;
+
+	w->high_bits = (w->high_bits << (4 - i))
+	             | ((c >> word_bits) & mask[i]);
+	/* The bits below the flags belong to the encoded word. */
+	w->word = (w->word << word_bits) | (c & ((1u << word_bits) - 1));
+}
+
+/* Parse an entire Pseudo-UTF8 sequence.
+ *
+ * r: reader to consume bytes from.
+ * w: receives the length, the high-bit flags and the encoded word.
+ *
+ * Returns 1 on success. Returns 0 if the input ends early, the start
+ * byte is invalid, or a continuation byte lacks the 10 prefix.
+ */
+static int decode_seq(struct creole_reader *r, struct word *w)
+{
+	int r_ret;
+	int to_read;
+
+	r_ret = read(r);
+	if (r_ret < 0)
+		return 0;
+
+	to_read = parse_start_byte((unsigned char)(r_ret & 0xFF), w);
+	if (to_read < 0)
+		return 0;
+
+	/* If to_read is not one less than w->len, that means there are
+	 * high bits in the first continuation byte.
+	 */
+	if (w->len - to_read > 1) {
+		r_ret = read(r);
+		if (r_ret < 0)
+			return 0;
+		/* The special byte is still a continuation byte. */
+		if ((r_ret & 0xC0) != 0x80)
+			return 0;
+		parse_special_byte((unsigned char)(r_ret & 0xFF), w);
+	}
+
+	return read_continue(r, w, to_read);
+}
+
+/*************************************************************************
+ * Parsing instructions
+ *
+ * This parses an entire instruction, which is
+ *  a single byte sequence,
+ *  zero or more multibyte sequences,
+ *  one single byte of all zeros.
+ *************************************************************************/
+
+/* Parse one instruction: a single byte opcode, one multibyte sequence
+ * per argument of that opcode, and a terminating byte of all zeros.
+ *
+ * ins: receives the opcode, the argument words and their flags.
+ * r:   reader to consume bytes from.
+ *
+ * Returns 1 on success and 0 if the line is malformed. ins may be
+ * partially written when 0 is returned.
+ */
+int creole_parse_line(struct creole_ins *ins, struct creole_reader *r)
+{
+	struct word w = {0};
+	unsigned arg = 0;
+	unsigned arglen;
+
+	if (!decode_seq(r, &w))
+		return 0;
+
+	/* The opcode indexes opcode_info, so it must be checked before
+	 * it is used.
+	 */
+	if (w.len != 1 || w.word >= CREOLE_OPCODE_LEN)
+		return 0;
+	ins->opcode = (enum creole_opcode)w.word;
+	arglen = opcode_info[ins->opcode].arglen;
+
+	for (arg = 0; arg < arglen; arg++) {
+		if (!decode_seq(r, &w))
+			return 0;
+		/* Arguments are always multibyte sequences. */
+		if (w.len == 1)
+			return 0;
+		ins->w[arg] = w.word;
+		ins->w_flags[arg] = w.high_bits;
+	}
+
+	/* Every line ends with a single zero byte. */
+	if (!decode_seq(r, &w))
+		return 0;
+	if (w.word != 0 || w.len != 1)
+		return 0;
+	return 1;
+}
+
+/**************************************************************************
+ * High level compiling interface
+ *************************************************************************/
+
+/* Remove the instruction that was just compiled: zero the slot and move
+ * the program pointer back so that the next instruction reuses it.
+ */
+static void clear_instruction(struct creole_env *env,
+                              struct creole_ins *ins)
+{
+	/* sizeof(*ins) is the size of the instruction itself;
+	 * sizeof(ins) would only cover the size of a pointer.
+	 */
+	memset(ins, 0, sizeof(*ins));
+	env->prgptr--;
+}
+
+/* Check a single argument flag against the type an opcode expects.
+ *
+ * Returns nonzero if fl is acceptable for typ, and 0 otherwise. An
+ * argument is never acceptable for TYPE_NONE.
+ */
+static int typecheck(enum creole_word_flag fl, enum creole_arg_type typ)
+{
+	switch (typ) {
+	case TYPE_NONE: return 0;
+	case TYPE_IMM: return fl == CREOLE_IMMEDIATE;
+	case TYPE_REG: return fl == CREOLE_REGISTER;
+	case TYPE_VAL: return fl == CREOLE_IMMEDIATE
+	                    || fl == CREOLE_REGISTER;
+	/* Labels are written as immediate values. */
+	case TYPE_LAB: return fl == CREOLE_IMMEDIATE;
+	default: return 0;
+	}
+}
+
+/* Check the flag of every argument of ins against the types its opcode
+ * accepts.
+ *
+ * env is not needed for the check; the parameter is kept so that the
+ * call sites do not change.
+ *
+ * Returns CREOLE_COMPILE_OK, or CREOLE_TYPE_ERROR on the first
+ * mismatch.
+ */
+static enum creole_compiler_ret typecheck_ins(struct creole_env *env,
+                                              struct creole_ins *ins)
+{
+	unsigned i;
+
+	(void)env;
+
+	for (i = 0; i < opcode_info[ins->opcode].arglen; i++) {
+		/* The type lives in the flags, not in the word itself. */
+		if (!typecheck((enum creole_word_flag)ins->w_flags[i],
+		               opcode_info[ins->opcode].argtype[i]))
+			return CREOLE_TYPE_ERROR;
+	}
+	return CREOLE_COMPILE_OK;
+}
+
+/* Handle instructions that only exist at compile time (CLB and NOOP)
+ * and typecheck everything else.
+ *
+ * This is called after env->prgptr has been advanced past ins.
+ * Compile time instructions are removed again, which moves env->prgptr
+ * back onto the slot of ins.
+ *
+ * Returns CREOLE_COMPILE_OK, CREOLE_LABEL_OVERFLOW if a label index
+ * does not fit the label table, or the result of the typecheck.
+ */
+static enum creole_compiler_ret
+handle_compiletime_immediate(struct creole_env *env,
+                             struct creole_ins *ins)
+{
+	creole_word label;
+
+	switch (ins->opcode) {
+	case CREOLE_CLB:
+		/* Save the index: clearing the instruction zeroes it. */
+		label = ins->w[0];
+		if (label >= env->lablen)
+			return CREOLE_LABEL_OVERFLOW;
+		/* Delete instruction because it is a compile time
+		 * instruction. Place next instruction in its place. */
+		clear_instruction(env, ins);
+		/* prgptr now names the slot of the next instruction,
+		 * which is where a jump to this label must land. */
+		env->lab[label] = env->prgptr;
+		return CREOLE_COMPILE_OK;
+	case CREOLE_NOOP:
+		clear_instruction(env, ins);
+		return CREOLE_COMPILE_OK;
+	default:
+		return typecheck_ins(env, ins);
+	}
+}
+
+/* Compile the bytecode in r into the program buffer of env.
+ *
+ * Instructions are parsed one line at a time into env->prg, starting at
+ * env->prgptr. Compile time instructions are dropped and their slot is
+ * reused. On success env->prgend is the number of instructions stored
+ * and env->prgptr is reset to 0.
+ *
+ * Returns CREOLE_COMPILE_OK on success, CREOLE_COMPILE_PARSE_ERROR for
+ * a malformed line, CREOLE_PROGRAM_OVERFLOW if input remains after the
+ * program buffer is full, or any error reported while handling an
+ * instruction.
+ */
+int creole_compile(struct creole_env *env, struct creole_reader *r)
+{
+	struct creole_ins *cur_ins;
+	int rcode;
+
+	while (env->prgptr < env->prglen) {
+		/* Always derive the slot from prgptr, so that a removed
+		 * compile time instruction really is overwritten.
+		 */
+		cur_ins = env->prg + env->prgptr;
+		if (!creole_parse_line(cur_ins, r))
+			return CREOLE_COMPILE_PARSE_ERROR;
+		/* Increase prgptr here. If the instruction is a compile
+		 * time instruction, then this will be decremented since
+		 * the instruction will not be executed.
+		 */
+		env->prgptr++;
+
+		rcode = handle_compiletime_immediate(env, cur_ins);
+		if (rcode != CREOLE_COMPILE_OK)
+			return rcode;
+
+		if (read_eof(r))
+			break;
+	}
+
+	/* The buffer is full but there is still bytecode to compile. */
+	if (env->prgptr == env->prglen && !read_eof(r))
+		return CREOLE_PROGRAM_OVERFLOW;
+	env->prgend = env->prgptr;
+	env->prgptr = 0;
+	return CREOLE_COMPILE_OK;
+}
+
+/* Result codes of creole_step().
+ * NOTE(review): these are defined here because no header in this commit
+ * declares them; they presumably belong in creole.h - confirm.
+ */
+enum creole_step_ret {
+	CREOLE_STEP_CONTINUE,      /* executed one instruction */
+	CREOLE_STEP_STOP,          /* reached the end of the program */
+	CREOLE_STACK_OVERFLOW,     /* push onto a full stack */
+	CREOLE_STACK_UNDERFLOW,    /* pop from an empty stack */
+	CREOLE_DIV_BY_ZERO,        /* divisor evaluated to zero */
+	CREOLE_STEP_BAD_OPERAND,   /* register or label index out of range */
+	CREOLE_STEP_UNKNOWN_OPCODE /* opcode is not a known instruction */
+};
+
+/* Fetch the value of argument i of ins: the register contents if the
+ * argument is a register, the word itself otherwise.
+ *
+ * Returns 1 and stores the value in *out, or returns 0 if the register
+ * index is out of range.
+ */
+static int read_word(const struct creole_env *env,
+                     const struct creole_ins *ins, int i,
+                     creole_word *out)
+{
+	creole_word v = ins->w[i];
+
+	if (ins->w_flags[i] == CREOLE_REGISTER) {
+		if (v >= env->reglen)
+			return 0;
+		v = env->reg[v];
+	}
+	*out = v;
+	return 1;
+}
+
+/* Fetch arguments 1 and 2 of a three argument instruction.
+ * Returns 1 on success, 0 if either argument cannot be read.
+ */
+static int read_operands(const struct creole_env *env,
+                         const struct creole_ins *ins,
+                         creole_word *a1, creole_word *a2)
+{
+	return read_word(env, ins, 1, a1) && read_word(env, ins, 2, a2);
+}
+
+/* Execute the instruction at env->prgptr and advance the program
+ * pointer. The program pointer is advanced even if the instruction
+ * fails.
+ *
+ * Returns one of the enum creole_step_ret values.
+ */
+int creole_step(struct creole_env *env)
+{
+	struct creole_ins *ins;
+	creole_word a1 = 0, a2 = 0;
+
+	if (env->prgptr >= env->prgend)
+		return CREOLE_STEP_STOP;
+	ins = env->prg + env->prgptr;
+	env->prgptr++;
+
+	switch (ins->opcode) {
+	case CREOLE_NOOP:
+	case CREOLE_CLB:
+		/* Compile time instructions; nothing to do at run time. */
+		break;
+	case CREOLE_PUSH:
+		/* PUSH accepts a register or an immediate value. */
+		if (!read_word(env, ins, 0, &a1))
+			return CREOLE_STEP_BAD_OPERAND;
+		if (env->stkptr == env->stklen)
+			return CREOLE_STACK_OVERFLOW;
+		env->stk[env->stkptr++] = a1;
+		break;
+	case CREOLE_POP:
+		if (ins->w[0] >= env->reglen)
+			return CREOLE_STEP_BAD_OPERAND;
+		if (env->stkptr == 0)
+			return CREOLE_STACK_UNDERFLOW;
+		env->reg[ins->w[0]] = env->stk[--env->stkptr];
+		break;
+	case CREOLE_ADD:
+		if (ins->w[0] >= env->reglen
+		    || !read_operands(env, ins, &a1, &a2))
+			return CREOLE_STEP_BAD_OPERAND;
+		env->reg[ins->w[0]] = a1 + a2;
+		break;
+	case CREOLE_MUL:
+		if (ins->w[0] >= env->reglen
+		    || !read_operands(env, ins, &a1, &a2))
+			return CREOLE_STEP_BAD_OPERAND;
+		env->reg[ins->w[0]] = a1 * a2;
+		break;
+	case CREOLE_DIV:
+		if (ins->w[0] >= env->reglen
+		    || !read_operands(env, ins, &a1, &a2))
+			return CREOLE_STEP_BAD_OPERAND;
+		/* Division by zero is undefined behavior in C. */
+		if (a2 == 0)
+			return CREOLE_DIV_BY_ZERO;
+		env->reg[ins->w[0]] = a1 / a2;
+		break;
+	case CREOLE_JL:
+		if (ins->w[0] >= env->lablen
+		    || !read_operands(env, ins, &a1, &a2))
+			return CREOLE_STEP_BAD_OPERAND;
+		if (a1 < a2)
+			env->prgptr = env->lab[ins->w[0]];
+		break;
+	case CREOLE_SYS:
+		/* SYS takes a single argument. */
+		if (!read_word(env, ins, 0, &a1))
+			return CREOLE_STEP_BAD_OPERAND;
+		/* do syscall */
+		(void)a1;
+		break;
+	default:
+		return CREOLE_STEP_UNKNOWN_OPCODE;
+	}
+
+	return CREOLE_STEP_CONTINUE;
+}
+#endif
--- a/creole.h
+++ b/creole.h
@ -0,0 +1,67 @@
+#ifndef CREOLE_H
+#define CREOLE_H
+
+#include <limits.h>
+
+#ifndef CREOLE_WORD /* define CREOLE_WORD before including this header to pick another word type */
+# define CREOLE_WORD unsigned int
+# define CREOLE_WORD_MAX UINT_MAX /* only defined here; an overriding CREOLE_WORD must bring its own maximum */
+#endif
+
+#define CREOLE_MAX_ARG 3 /* size of the per-opcode argument type table */
+
+typedef CREOLE_WORD creole_word; /* type of registers, stack slots and instruction arguments */
+
+enum creole_opcode { /* values index the opcode table in creole.c, so they must stay dense from 0 */
+	CREOLE_NOOP = 0, /* no operation; dropped at compile time */
+	CREOLE_PUSH = 1, /* push a value onto the stack */
+	CREOLE_POP = 2, /* pop the top of the stack into a register */
+	CREOLE_ADD = 3, /* register = value + value */
+	CREOLE_MUL = 4, /* register = value * value */
+	CREOLE_DIV = 5, /* register = value / value */
+	CREOLE_JL = 6, /* jump to a label if the first value is less than the second */
+	CREOLE_CLB = 7, /* define a label; dropped at compile time */
+	CREOLE_SYS = 8, /* system call; not implemented in creole.c yet */
+	CREOLE_OPCODE_LEN /* number of opcodes; not an opcode itself */
+};
+
+enum creole_word_flag { /* NOTE(review): README.md lists 0 as register and 1 as immediate, the opposite of this enum - confirm which is intended */
+	CREOLE_IMMEDIATE, /* the word is the value itself */
+	CREOLE_REGISTER, /* the word is a register index */
+	CREOLE_WORD_FLAGS_LEN /* number of flags; not a flag itself */
+};
+
+/* Result codes of the compiler. */
+enum creole_compiler_ret {
+	CREOLE_COMPILE_OK, /* the program was compiled */
+	CREOLE_COMPILE_PARSE_ERROR, /* a line of bytecode was malformed */
+	CREOLE_LABEL_OVERFLOW, /* a label index does not fit the label table */
+	CREOLE_TYPE_ERROR, /* an argument has the wrong type for its opcode */
+	CREOLE_PROGRAM_OVERFLOW, /* the bytecode does not fit the program buffer */
+	CREOLE_COMPILE_RET_LEN /* number of result codes; not a result itself */
+};
+
+/* A single decoded instruction. Both arrays are sized by
+ * CREOLE_MAX_ARG so that they always agree with the opcode table.
+ */
+struct creole_ins {
+	enum creole_opcode opcode;
+	/* Flag of each argument, see enum creole_word_flag. */
+	unsigned char w_flags[CREOLE_MAX_ARG];
+	/* Word of each argument: a value, register index or label. */
+	creole_word w[CREOLE_MAX_ARG];
+};
+
+struct creole_env { /* Interpreter state. NOTE(review): size_t is used below but this header only includes <limits.h>; presumably <stddef.h> is needed - confirm */
+	creole_word *reg; /* register file */
+	size_t reglen; /* presumably the number of registers; not read by creole.c yet */
+
+	size_t *lab; /* label table: program index recorded for each label */
+	size_t lablen; /* number of entries in lab */
+
+	creole_word *stk; /* value stack */
+	size_t stkptr, stklen; /* next free stack slot, and the stack capacity */
+
+	struct creole_ins *prg; /* compiled program */
+	size_t prgptr, prgend, prglen; /* current instruction, end of the compiled program, and buffer capacity */
+};
+
+struct creole_reader { /* cursor over a buffer of bytecode */
+	unsigned char *p; /* next byte to read */
+	size_t left; /* number of bytes remaining at p */
+};
+
+#endif /* CREOLE_H */
--- a/greatest.h
+++ b/greatest.h
--- a/test_creole.c
+++ b/test_creole.c
@ -0,0 +1,44 @@
+#include "greatest.h"
+GREATEST_MAIN_DEFS();
+#include "creole.c"
+
+/**************************************************************************
+ * Reader suite
+ *************************************************************************/
+/* Point reader r at the string literal s. The terminating NUL of the
+ * literal is not part of the input. s must be an actual literal (or
+ * array), because its length is taken with sizeof.
+ */
+#define reader_lit(r, s) do {         \
+	(r).p = (unsigned char *)(s); \
+	(r).left = sizeof(s) - 1;     \
+} while(0)
+
+/* Drains the reader one byte at a time and checks every byte against
+ * the buffer it was set up with. Afterwards the reader must keep
+ * reporting end of input on repeated reads.
+ */
+TEST reader_test_basic(struct creole_reader *r) {
+	unsigned char *expected = r->p;
+	size_t total = r->left;
+	size_t idx;
+	int extra;
+
+	for (idx = 0; idx != total; ++idx) {
+		ASSERT_EQ(read_eof(r), 0);
+		ASSERT_EQ(read(r), expected[idx]);
+	}
+
+	for (extra = 0; extra < 5; ++extra) {
+		ASSERT_EQ(read_eof(r), 1);
+		ASSERT_EQ(read(r), -1);
+	}
+	PASS();
+}
+
+SUITE(reader) { /* runs reader_test_basic over a non-empty and an empty input */
+	struct creole_reader r = {0};
+
+	reader_lit(r, "abcdefg"); /* seven bytes; the terminating NUL is excluded */
+	RUN_TEST1(reader_test_basic, &r);
+
+	reader_lit(r, ""); /* zero bytes: the reader starts at end of input */
+	RUN_TEST1(reader_test_basic, &r);
+}
+
+int main(int argc, char *argv[]) { /* test runner entry point */
+	GREATEST_MAIN_BEGIN(); /* presumably handles argc/argv and sets up the runner - see greatest.h */
+	RUN_SUITE(reader);
+	GREATEST_MAIN_END(); /* presumably prints the report and returns the exit status - see greatest.h */
+}