prototype bytecode interpreter

This commit is contained in:
Peter McGoron 2023-02-05 11:44:37 +00:00
commit f63d6cdd3d
8 changed files with 1827 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
test_creole

14
LICENSE.md Normal file
View File

@ -0,0 +1,14 @@
Copyright (c) 2023 Peter McGoron <code@mcgoron.com>
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

2
Makefile Normal file
View File

@ -0,0 +1,2 @@
test_creole: test_creole.c creole.c creole.h greatest.h
$(CC) test_creole.c -Wall -pedantic -std=c89 -o test_creole

23
README.md Normal file
View File

@ -0,0 +1,23 @@
Creole is a bytecode designed for simple implementations.
## Bytecode Format
Each creole line consists of pseudo-UTF-8 characters. The first byte
is an unsigned number between 0 and 127 (the high bit is clear). Each
suceeding pseudo-UTF-8 character is encoded as follows:
* `110xxxxx 10xxxxxx`
* `1110xxxx 10xxxxxx 10xxxxxx`
* `11110xxx 10xxxxxx 10xxxxxx 10xxxxxx`
* `111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx`
* `1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx`
* `11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx`
The first four bytes determine the type:
* `0`: Value is a register.
* `1`: Value is immediate.
All other values are reserved. Overlong values are allowed, and for some
argument values they are necessary. All lines are terminated by a byte
of all zeros.

410
creole.c Normal file
View File

@ -0,0 +1,410 @@
#include "creole.h"
/*************************************************************************
* Static information
************************************************************************/
/* Arguments to opcodes can accept the following:
* immediate values only (as of now, no values are like this)
* register values only (push, pop, etc.)
* either values (math operations)
* labels (jumps)
* none (do not give an argument)
*/
enum creole_arg_type {
TYPE_NONE,
TYPE_IMM,
TYPE_REG,
TYPE_VAL,
TYPE_LAB,
CREOLE_ARG_TYPE_LEN
};
/* C99+ allows for designating the array index when initializing arrays:
[i] = v,
* in C89 indicies are implicit from 0 to the maximum filled-in value.
*/
#define defop(s, n, a1, a2, a3) {n, {a1, a2, a3}}
static const struct {
unsigned arglen;
enum creole_arg_type argtype[CREOLE_MAX_ARG];
} opcode_info[CREOLE_OPCODE_LEN] = {
defop(NOOP, 0, TYPE_NONE, TYPE_NONE, TYPE_NONE),
defop(PUSH, 1, TYPE_VAL, TYPE_NONE, TYPE_NONE),
defop(POP, 1, TYPE_REG, TYPE_NONE, TYPE_NONE),
defop(ADD, 3, TYPE_REG, TYPE_VAL, TYPE_VAL),
defop(MUL, 3, TYPE_REG, TYPE_VAL, TYPE_VAL),
defop(DIV, 3, TYPE_REG, TYPE_VAL, TYPE_VAL),
defop(JL, 3, TYPE_LAB, TYPE_VAL, TYPE_VAL),
defop(CLB, 1, TYPE_LAB, TYPE_NONE, TYPE_NONE),
defop(SYS, 1, TYPE_VAL, TYPE_NONE, TYPE_NONE)
};
/*************************************************************************
* Reading from the buffer
************************************************************************/
static int read(struct creole_reader *r)
{
if (r->left == 0)
return -1;
r->left--;
return *r->p++;
}
static int read_eof(struct creole_reader *r)
{
return r->left == 0;
}
#if 0
/*************************************************************************
* Pseudo-UTF-8 lexing
*
* Pseudo-UTF-8 is based off of UTF-8 but adds more
* bytes and allows (requires!) overlong encodings.
*
* Possible values:
* 0xxxxxxx (7 bits)
* 110xxxxx 10xxxxxx (11 bits)
* 1110xxxx 10xxxxxx 10xxxxxx (16 bits)
* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (21 bits)
* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (26 bits)
* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (31 bits)
* 11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (36 bits)
* 10xxxxxx
************************************************************************/
/* A Psuedo-UTF-8 sequence can be either
*
* * A 1 byte sequence, where the lower 7 bits are the encoded
* * word (no high bits).
* * A multi-byte sequence where the 4 MSB are flags, and the
* * lower bits are the encoded word.
*/
struct word {
int len;
int high_bits;
creole_word word;
};
/* Decode a set of continuation bytes directly into the word. This assumes
* that each continuation byte contains no high words.
*/
static int read_continue(struct creole_reader *r, struct encoded_word *w,
int to_read)
{
int i;
int r_ret;
unsigned char c;
for (i = 0; i < to_read) {
r_ret = read(r);
if (r_ret < 0)
return 0;
/* Characters might not be 8 bits! */
c = (unsigned char)(r_ret & 0xFF);
if (c >> 6 != 0x2)
return 0;
w->word = w->word << 6 | (c & 0x6);
}
return 1;
}
/* Start bytes must be treated differently. Depending on the scenario,
* start bytes will contain parts of the encoded word and high-bit flags.
* In some cases, not all of the high-bit flags are part of the start
* byte.
*/
#define START_BYTE_NUM 7
static int parse_start_byte(unsigned char c, struct word *w)
{
static const struct {
/* The algorithm compares the mask to the start byte
* by shifting both to the right by the amount of 'x's
* (blank spaces). The array is arranged in reverse
* order so that the index indicates the amount of
* bits to shift.
*/
unsigned char mask;
/* The word bits, if they exist, always start from the
* LSB, so there is no need to shift the bits away. The
* word_mask gets the low bits. If there are no bits, set
* to 0.
*/
unsigned char word_mask;
/* The high bits may not start from the LSB. There needs
* to be a shift to get the bits to the LSB, and a mask
* to discard the higher bits.
*/
unsigned char high_bit_mask;
int high_bit_shift;
/* The amount of NORMAL continuation bytes to read.
* This does NOT include continuation bytes that have
* high-bit flags in them.
*/
int to_read;
} start_data[START_BYTE_NUM] {
{0xFE, 0x00, 0, 0x0, 5}, /* 11111110 */
{0xFC, 0x00, 0, 0x1, 4}, /* 1111110x */,
{0xF8, 0x00, 0, 0x3, 3}, /* 111110xx */,
{0xF0, 0x00, 0, 0x7, 2}, /* 11110xxx */,
{0xE0, 0x00, 0, 0xF, 2}, /* 1110xxxx */,
{0xC0, 0x01, 1, 0xF, 1}, /* 110xxxxx */,
/* The single byte sequence has no high bits. */
{0x00, 0x7F, 0, 0x0, 0} /* 0xxxxxxx */,
};
int i;
for (i = 0; i < START_BYTE_NUM; i++) {
if (c >> i == start_data[i].mask >> i) {
w->len = START_BYTE_NUM - i;
w->word = c & start_data[i].word_mask;
w->high_bits = (c >> start_data[i].high_bit_shift)
& start_data[i].high_bit_mask;
return start_data[i].to_read;
}
}
return -1;
}
/* This parses the first continuation byte if it is special. */
#define SPECIAL_CONTINUE_BYTE_NUM (START_BYTE_NUM - 3)
static void parse_special_byte(unsigned char c, struct word *w)
{
/* The index denotes the amount of high bits that were in
* the start byte. This is the amount that the stored value
* must be shifted.
*
* The amount of bits that must be shifted out in the continue
* byte increase with the index. The amount shifted is (i + 2).
*
* Each value stored in the array is the mask applied after
* shifting the continue byte bits.
*/
static const unsigned char mask[SPECIAL_CONTINUE_BYTE_NUM] = {
0xF, /* 1111110 10HHHHxx */
0x7, /* 111110H 10HHHxxx */
0x3, /* 11110HH 10HHxxxx */
0x1 /* 1110HHH 10Hxxxxx */
};
int i = w->len - START_BYTE_NUM;
w->high_bits = (w->high_bits << i) | ((c >> (2 + i)) & mask[i]);
}
/* Parse an entire Pseudo-UTF8 sequence. */
static int decode_seq(struct creole_reader *r, struct word *w)
{
int r_ret;
unsigned char c;
int to_read;
r_ret = read(r);
if (r_ret < 0)
return 0;
to_read = parse_start_byte((unsigned char)(r_ret & 0xFF), w);
if (to_read < 0)
return 0;
/* If to_read is not one less than w->len, that means there are
* high bits in the first continuation byte.
*/
if (w->len - to_read > 1) {
r_ret = read(r);
if (r_ret < 0)
return 0;
parse_special_byte((unsigned char)(r_ret & 0xFF), w);
}
return read_continue(r, decoded_word, to_read);
}
/*************************************************************************
* Parsing instructions
*
* This parses an entire instruction, which is
* a single byte sequence,
* zero or more multibyte sequences,
* one single byte of all zeros.
*************************************************************************/
int creole_parse_line(struct creole_ins *ins, struct creole_reader *r)
{
struct word w = {0};
unsigned arg = 0;
if (!decode_seq(r, &w))
return 0;
ins->opcode = w.word;
if (w.word < CREOLE_ARG_TYPE_LEN || w.len != 1)
return 0;
for (arg = 0; arg < arglen; arg++) {
if (!decode_seq(r, &w))
return 0;
if (w.len == 1)
return 0;
ins->w[arg] = w.word;
ins->w_flags[arg] = w.high_bits;
}
if (!decode_seq(r, &w))
return 0;
if (w.word != 0 || w.len != 1)
return 0;
return 1;
}
/**************************************************************************
* High level compiling interface
*************************************************************************/
static void clear_instruction(struct creole_env *env,
struct creole_ins *ins)
{
memset(ins, 0, sizeof(ins));
env->prgptr--;
}
static int typecheck(enum creole_word_flag fl, enum creole_arg_type typ)
{
switch (typ) {
case TYPE_NONE: return 0;
case TYPE_IMM: return fl == CREOLE_IMMEDIATE;
case TYPE_REG: return fl == CREOLE_REGISTER;
case TYPE_VAL: return fl == CREOLE_IMMEDIATE
| fl == CREOLE_REGISTER;
case TYPE_LAB: return fl == CREOLE_IMMEDIATE;
default: return 0;
}
}
static enum creole_compiler_ret typecheck_ins(struct creole_env *env,
struct creole_ins *ins)
{
unsigned i;
for (i = 0; i < opcode_info[env->opcode].arglen; i++) {
if (!typecheck(ins->w[i],
opcode_info[env->opcode].argtype[i]))
return CREOLE_TYPE_ERROR;
}
return CREOLE_COMPILE_OK;
}
static enum creole_compiler_ret
handle_compiletime_immediate(struct creole_env *env,
struct creole_ins *ins)
{
switch (ins->opcode) {
case CREOLE_CLB:
if (ins->w[0] >= ins->lablen)
return CREOLE_LABEL_OVERFLOW;
ins->lab[ins->w[0]] = env->prgptr;
/* Delete instruction because it is a compile time
* instruction. Place next instruction in its place. */
clear_instruction(env, ins);
return CREOLE_COMPILE_OK;
case CREOLE_NOOP:
clear_instruction(env, ins);
return CREOLE_COMPILE_OK;
default:
return typecheck_ins(env, ins);
}
}
int creole_compile(struct creole_env *env, struct creole_reader *r)
{
struct creole_ins *cur_ins = env->prg;
int rcode;
while (env->prgptr < env->prglen) {
if (!creole_parse_line(cur_ins, r))
return CREOLE_PARSE_ERROR;
/* Increase prgptr here. If the instruction is a compile
* time instruction, then this will be decremented since
* the instruction will not be executed.
*/
env->prgptr++;
rcode = handle_compiletime_immediate(env, cur_ins);
if (rcode != CREOLE_COMPILE_OK)
return rcode;
if (read_eof(r))
break;
cur_ins += 1;
}
if (env->prgptr == env->prglen && *line)
return CREOLE_PROGRAM_OVERFLOW;
env->prgend = env->prgptr;
env->prgptr = 0;
return CREOLE_COMPILE_OK;
}
static creole_word read_word(struct creole_ins *ins, int i)
{
if (env->w_flags[i] == CREOLE_REGISTER)
return env->reg[env->w[i]];
else
return env->w[i];
}
int creole_step(struct creole_env *env)
{
struct creole_ins *ins = env->prg + env->prgptr;
creole_word a1, a2;
if (env->prgptr == env->prgend)
return CREOLE_STEP_STOP;
env->prgptr++;
switch (ins->opcode) {
case CREOLE_PUSH:
if (env->stkptr == env->stklen)
return CREOLE_STACK_OVERFLOW;
env->stk[env->stkptr++] = env->reg[env->w[0]];
break;
case CREOLE_POP:
if (env->stkptr == 0)
return CREOLE_STACK_OVERFLOW;
env->reg[env->w[0]] = env->stk[--env->stkptr];
break;
case CREOLE_ADD:
a1 = read_word(ins, 1);
a2 = read_word(ins, 2);
env->reg[env->w[0]] = a1 + a2;
break;
case CREOLE_MUL:
a1 = read_word(ins, 1);
a2 = read_word(ins, 2);
env->reg[env->w[0]] = a1 * a2;
break;
case CREOLE_DIV:
a1 = read_word(ins, 1);
a2 = read_word(ins, 2);
env->reg[env->w[0]] = a1 / a2;
break;
case CREOLE_JL:
a1 = read_word(ins, 1);
a2 = read_word(ins, 2);
if (a1 < a2)
env->prgptr = env->lab[env->w[0]];
break;
case SYS:
a1 = read_word(ins, 1);
/* do syscall */
break;
}
}
#endif

67
creole.h Normal file
View File

@ -0,0 +1,67 @@
#ifndef CREOLE_H
#define CREOLE_H
#include <limits.h>
#ifndef CREOLE_WORD
# define CREOLE_WORD unsigned int
# define CREOLE_WORD_MAX UINT_MAX
#endif
#define CREOLE_MAX_ARG 3
typedef CREOLE_WORD creole_word;
enum creole_opcode {
CREOLE_NOOP = 0,
CREOLE_PUSH = 1,
CREOLE_POP = 2,
CREOLE_ADD = 3,
CREOLE_MUL = 4,
CREOLE_DIV = 5,
CREOLE_JL = 6,
CREOLE_CLB = 7,
CREOLE_SYS = 8,
CREOLE_OPCODE_LEN
};
enum creole_word_flag {
CREOLE_IMMEDIATE,
CREOLE_REGISTER,
CREOLE_WORD_FLAGS_LEN
};
enum creole_compiler_ret {
CREOLE_COMPILE_OK,
CREOLE_COMPILE_PARSE_ERROR,
CREOLE_LABEL_OVERFLOW,
CREOLE_TYPE_ERROR,
CREOLE_COMPILE_RET_LEN
};
struct creole_ins {
enum creole_opcode opcode;
unsigned char w_flags[3];
creole_word w[3];
};
struct creole_env {
creole_word *reg;
size_t reglen;
size_t *lab;
size_t lablen;
creole_word *stk;
size_t stkptr, stklen;
struct creole_ins *prg;
size_t prgptr, prgend, prglen;
};
struct creole_reader {
unsigned char *p;
size_t left;
};
#endif /* CREOLE_H */

1266
greatest.h Normal file

File diff suppressed because it is too large Load Diff

44
test_creole.c Normal file
View File

@ -0,0 +1,44 @@
#include "greatest.h"
GREATEST_MAIN_DEFS();
#include "creole.c"
/**************************************************************************
* Reader suite
*************************************************************************/
#define reader_lit(r, s) do { \
r.p = (unsigned char *)s; \
r.left = sizeof(s) - 1; \
} while(0)
TEST reader_test_basic(struct creole_reader *r) {
size_t i = 0;
unsigned char *s = r->p;
size_t len = r->left;
for (i = 0; i < len; i++) {
ASSERT_EQ(read_eof(r), 0);
ASSERT_EQ(read(r), s[i]);
}
for (i = 0; i < 5; i++) {
ASSERT_EQ(read_eof(r), 1);
ASSERT_EQ(read(r), -1);
}
PASS();
}
SUITE(reader) {
struct creole_reader r = {0};
reader_lit(r, "abcdefg");
RUN_TEST1(reader_test_basic, &r);
reader_lit(r, "");
RUN_TEST1(reader_test_basic, &r);
}
int main(int argc, char *argv[]) {
GREATEST_MAIN_BEGIN();
RUN_SUITE(reader);
GREATEST_MAIN_END();
}