prototype bytecode interpreter
This commit is contained in:
commit
f63d6cdd3d
|
@ -0,0 +1 @@
|
|||
test_creole
|
|
@ -0,0 +1,14 @@
|
|||
Copyright (c) 2023 Peter McGoron <code@mcgoron.com>
|
||||
|
||||
Permission to use, copy, modify, and/or distribute this software for any
|
||||
purpose with or without fee is hereby granted, provided that the above
|
||||
copyright notice and this permission notice appear in all copies.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
test_creole: test_creole.c creole.c creole.h greatest.h
|
||||
$(CC) test_creole.c -Wall -pedantic -std=c89 -o test_creole
|
|
@ -0,0 +1,23 @@
|
|||
Creole is a bytecode designed for simple implementations.
|
||||
|
||||
## Bytecode Format
|
||||
|
||||
Each creole line consists of pseudo-UTF-8 characters. The first byte
|
||||
is an unsigned number between 0 and 127 (the high bit is clear). Each
|
||||
suceeding pseudo-UTF-8 character is encoded as follows:
|
||||
|
||||
* `110xxxxx 10xxxxxx`
|
||||
* `1110xxxx 10xxxxxx 10xxxxxx`
|
||||
* `11110xxx 10xxxxxx 10xxxxxx 10xxxxxx`
|
||||
* `111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx`
|
||||
* `1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx`
|
||||
* `11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx`
|
||||
|
||||
The first four bytes determine the type:
|
||||
|
||||
* `0`: Value is a register.
|
||||
* `1`: Value is immediate.
|
||||
|
||||
All other values are reserved. Overlong values are allowed, and for some
|
||||
argument values they are necessary. All lines are terminated by a byte
|
||||
of all zeros.
|
|
@ -0,0 +1,410 @@
|
|||
#include "creole.h"
|
||||
|
||||
/*************************************************************************
|
||||
* Static information
|
||||
************************************************************************/
|
||||
|
||||
/* Arguments to opcodes can accept the following:
|
||||
* immediate values only (as of now, no values are like this)
|
||||
* register values only (push, pop, etc.)
|
||||
* either values (math operations)
|
||||
* labels (jumps)
|
||||
* none (do not give an argument)
|
||||
*/
|
||||
enum creole_arg_type {
|
||||
TYPE_NONE,
|
||||
TYPE_IMM,
|
||||
TYPE_REG,
|
||||
TYPE_VAL,
|
||||
TYPE_LAB,
|
||||
CREOLE_ARG_TYPE_LEN
|
||||
};
|
||||
|
||||
/* C99+ allows for designating the array index when initializing arrays:
|
||||
[i] = v,
|
||||
* in C89 indicies are implicit from 0 to the maximum filled-in value.
|
||||
*/
|
||||
#define defop(s, n, a1, a2, a3) {n, {a1, a2, a3}}
|
||||
static const struct {
|
||||
unsigned arglen;
|
||||
enum creole_arg_type argtype[CREOLE_MAX_ARG];
|
||||
} opcode_info[CREOLE_OPCODE_LEN] = {
|
||||
defop(NOOP, 0, TYPE_NONE, TYPE_NONE, TYPE_NONE),
|
||||
defop(PUSH, 1, TYPE_VAL, TYPE_NONE, TYPE_NONE),
|
||||
defop(POP, 1, TYPE_REG, TYPE_NONE, TYPE_NONE),
|
||||
defop(ADD, 3, TYPE_REG, TYPE_VAL, TYPE_VAL),
|
||||
defop(MUL, 3, TYPE_REG, TYPE_VAL, TYPE_VAL),
|
||||
defop(DIV, 3, TYPE_REG, TYPE_VAL, TYPE_VAL),
|
||||
defop(JL, 3, TYPE_LAB, TYPE_VAL, TYPE_VAL),
|
||||
defop(CLB, 1, TYPE_LAB, TYPE_NONE, TYPE_NONE),
|
||||
defop(SYS, 1, TYPE_VAL, TYPE_NONE, TYPE_NONE)
|
||||
};
|
||||
|
||||
/*************************************************************************
|
||||
* Reading from the buffer
|
||||
************************************************************************/
|
||||
|
||||
static int read(struct creole_reader *r)
|
||||
{
|
||||
if (r->left == 0)
|
||||
return -1;
|
||||
r->left--;
|
||||
return *r->p++;
|
||||
}
|
||||
|
||||
static int read_eof(struct creole_reader *r)
|
||||
{
|
||||
return r->left == 0;
|
||||
}
|
||||
|
||||
#if 0
|
||||
|
||||
/*************************************************************************
|
||||
* Pseudo-UTF-8 lexing
|
||||
*
|
||||
* Pseudo-UTF-8 is based off of UTF-8 but adds more
|
||||
* bytes and allows (requires!) overlong encodings.
|
||||
*
|
||||
* Possible values:
|
||||
* 0xxxxxxx (7 bits)
|
||||
* 110xxxxx 10xxxxxx (11 bits)
|
||||
* 1110xxxx 10xxxxxx 10xxxxxx (16 bits)
|
||||
* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (21 bits)
|
||||
* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (26 bits)
|
||||
* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (31 bits)
|
||||
* 11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (36 bits)
|
||||
* 10xxxxxx
|
||||
************************************************************************/
|
||||
|
||||
/* A Psuedo-UTF-8 sequence can be either
|
||||
*
|
||||
* * A 1 byte sequence, where the lower 7 bits are the encoded
|
||||
* * word (no high bits).
|
||||
|
||||
* * A multi-byte sequence where the 4 MSB are flags, and the
|
||||
* * lower bits are the encoded word.
|
||||
*/
|
||||
struct word {
|
||||
int len;
|
||||
int high_bits;
|
||||
creole_word word;
|
||||
};
|
||||
|
||||
/* Decode a set of continuation bytes directly into the word. This assumes
|
||||
* that each continuation byte contains no high words.
|
||||
*/
|
||||
static int read_continue(struct creole_reader *r, struct encoded_word *w,
|
||||
int to_read)
|
||||
{
|
||||
int i;
|
||||
int r_ret;
|
||||
unsigned char c;
|
||||
|
||||
for (i = 0; i < to_read) {
|
||||
r_ret = read(r);
|
||||
if (r_ret < 0)
|
||||
return 0;
|
||||
/* Characters might not be 8 bits! */
|
||||
c = (unsigned char)(r_ret & 0xFF);
|
||||
if (c >> 6 != 0x2)
|
||||
return 0;
|
||||
w->word = w->word << 6 | (c & 0x6);
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Start bytes must be treated differently. Depending on the scenario,
|
||||
* start bytes will contain parts of the encoded word and high-bit flags.
|
||||
* In some cases, not all of the high-bit flags are part of the start
|
||||
* byte.
|
||||
*/
|
||||
#define START_BYTE_NUM 7
|
||||
static int parse_start_byte(unsigned char c, struct word *w)
|
||||
{
|
||||
static const struct {
|
||||
/* The algorithm compares the mask to the start byte
|
||||
* by shifting both to the right by the amount of 'x's
|
||||
* (blank spaces). The array is arranged in reverse
|
||||
* order so that the index indicates the amount of
|
||||
* bits to shift.
|
||||
*/
|
||||
unsigned char mask;
|
||||
/* The word bits, if they exist, always start from the
|
||||
* LSB, so there is no need to shift the bits away. The
|
||||
* word_mask gets the low bits. If there are no bits, set
|
||||
* to 0.
|
||||
*/
|
||||
unsigned char word_mask;
|
||||
|
||||
/* The high bits may not start from the LSB. There needs
|
||||
* to be a shift to get the bits to the LSB, and a mask
|
||||
* to discard the higher bits.
|
||||
*/
|
||||
unsigned char high_bit_mask;
|
||||
int high_bit_shift;
|
||||
|
||||
/* The amount of NORMAL continuation bytes to read.
|
||||
* This does NOT include continuation bytes that have
|
||||
* high-bit flags in them.
|
||||
*/
|
||||
int to_read;
|
||||
} start_data[START_BYTE_NUM] {
|
||||
{0xFE, 0x00, 0, 0x0, 5}, /* 11111110 */
|
||||
{0xFC, 0x00, 0, 0x1, 4}, /* 1111110x */,
|
||||
{0xF8, 0x00, 0, 0x3, 3}, /* 111110xx */,
|
||||
{0xF0, 0x00, 0, 0x7, 2}, /* 11110xxx */,
|
||||
{0xE0, 0x00, 0, 0xF, 2}, /* 1110xxxx */,
|
||||
{0xC0, 0x01, 1, 0xF, 1}, /* 110xxxxx */,
|
||||
/* The single byte sequence has no high bits. */
|
||||
{0x00, 0x7F, 0, 0x0, 0} /* 0xxxxxxx */,
|
||||
};
|
||||
|
||||
int i;
|
||||
|
||||
for (i = 0; i < START_BYTE_NUM; i++) {
|
||||
if (c >> i == start_data[i].mask >> i) {
|
||||
w->len = START_BYTE_NUM - i;
|
||||
w->word = c & start_data[i].word_mask;
|
||||
w->high_bits = (c >> start_data[i].high_bit_shift)
|
||||
& start_data[i].high_bit_mask;
|
||||
return start_data[i].to_read;
|
||||
}
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* This parses the first continuation byte if it is special. */
|
||||
#define SPECIAL_CONTINUE_BYTE_NUM (START_BYTE_NUM - 3)
|
||||
static void parse_special_byte(unsigned char c, struct word *w)
|
||||
{
|
||||
/* The index denotes the amount of high bits that were in
|
||||
* the start byte. This is the amount that the stored value
|
||||
* must be shifted.
|
||||
*
|
||||
* The amount of bits that must be shifted out in the continue
|
||||
* byte increase with the index. The amount shifted is (i + 2).
|
||||
*
|
||||
* Each value stored in the array is the mask applied after
|
||||
* shifting the continue byte bits.
|
||||
*/
|
||||
static const unsigned char mask[SPECIAL_CONTINUE_BYTE_NUM] = {
|
||||
0xF, /* 1111110 10HHHHxx */
|
||||
0x7, /* 111110H 10HHHxxx */
|
||||
0x3, /* 11110HH 10HHxxxx */
|
||||
0x1 /* 1110HHH 10Hxxxxx */
|
||||
};
|
||||
int i = w->len - START_BYTE_NUM;
|
||||
w->high_bits = (w->high_bits << i) | ((c >> (2 + i)) & mask[i]);
|
||||
}
|
||||
|
||||
/* Parse an entire Pseudo-UTF8 sequence. */
|
||||
static int decode_seq(struct creole_reader *r, struct word *w)
|
||||
{
|
||||
int r_ret;
|
||||
unsigned char c;
|
||||
int to_read;
|
||||
|
||||
r_ret = read(r);
|
||||
if (r_ret < 0)
|
||||
return 0;
|
||||
|
||||
to_read = parse_start_byte((unsigned char)(r_ret & 0xFF), w);
|
||||
if (to_read < 0)
|
||||
return 0;
|
||||
|
||||
/* If to_read is not one less than w->len, that means there are
|
||||
* high bits in the first continuation byte.
|
||||
*/
|
||||
if (w->len - to_read > 1) {
|
||||
r_ret = read(r);
|
||||
if (r_ret < 0)
|
||||
return 0;
|
||||
parse_special_byte((unsigned char)(r_ret & 0xFF), w);
|
||||
}
|
||||
|
||||
return read_continue(r, decoded_word, to_read);
|
||||
}
|
||||
|
||||
/*************************************************************************
|
||||
* Parsing instructions
|
||||
*
|
||||
* This parses an entire instruction, which is
|
||||
* a single byte sequence,
|
||||
* zero or more multibyte sequences,
|
||||
* one single byte of all zeros.
|
||||
*************************************************************************/
|
||||
|
||||
int creole_parse_line(struct creole_ins *ins, struct creole_reader *r)
|
||||
{
|
||||
struct word w = {0};
|
||||
unsigned arg = 0;
|
||||
|
||||
if (!decode_seq(r, &w))
|
||||
return 0;
|
||||
|
||||
ins->opcode = w.word;
|
||||
if (w.word < CREOLE_ARG_TYPE_LEN || w.len != 1)
|
||||
return 0;
|
||||
|
||||
for (arg = 0; arg < arglen; arg++) {
|
||||
if (!decode_seq(r, &w))
|
||||
return 0;
|
||||
if (w.len == 1)
|
||||
return 0;
|
||||
ins->w[arg] = w.word;
|
||||
ins->w_flags[arg] = w.high_bits;
|
||||
}
|
||||
|
||||
if (!decode_seq(r, &w))
|
||||
return 0;
|
||||
if (w.word != 0 || w.len != 1)
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**************************************************************************
|
||||
* High level compiling interface
|
||||
*************************************************************************/
|
||||
|
||||
static void clear_instruction(struct creole_env *env,
|
||||
struct creole_ins *ins)
|
||||
{
|
||||
memset(ins, 0, sizeof(ins));
|
||||
env->prgptr--;
|
||||
}
|
||||
|
||||
static int typecheck(enum creole_word_flag fl, enum creole_arg_type typ)
|
||||
{
|
||||
switch (typ) {
|
||||
case TYPE_NONE: return 0;
|
||||
case TYPE_IMM: return fl == CREOLE_IMMEDIATE;
|
||||
case TYPE_REG: return fl == CREOLE_REGISTER;
|
||||
case TYPE_VAL: return fl == CREOLE_IMMEDIATE
|
||||
| fl == CREOLE_REGISTER;
|
||||
case TYPE_LAB: return fl == CREOLE_IMMEDIATE;
|
||||
default: return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static enum creole_compiler_ret typecheck_ins(struct creole_env *env,
|
||||
struct creole_ins *ins)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < opcode_info[env->opcode].arglen; i++) {
|
||||
if (!typecheck(ins->w[i],
|
||||
opcode_info[env->opcode].argtype[i]))
|
||||
return CREOLE_TYPE_ERROR;
|
||||
}
|
||||
return CREOLE_COMPILE_OK;
|
||||
}
|
||||
|
||||
static enum creole_compiler_ret
|
||||
handle_compiletime_immediate(struct creole_env *env,
|
||||
struct creole_ins *ins)
|
||||
{
|
||||
switch (ins->opcode) {
|
||||
case CREOLE_CLB:
|
||||
if (ins->w[0] >= ins->lablen)
|
||||
return CREOLE_LABEL_OVERFLOW;
|
||||
ins->lab[ins->w[0]] = env->prgptr;
|
||||
/* Delete instruction because it is a compile time
|
||||
* instruction. Place next instruction in its place. */
|
||||
clear_instruction(env, ins);
|
||||
return CREOLE_COMPILE_OK;
|
||||
case CREOLE_NOOP:
|
||||
clear_instruction(env, ins);
|
||||
return CREOLE_COMPILE_OK;
|
||||
default:
|
||||
return typecheck_ins(env, ins);
|
||||
}
|
||||
}
|
||||
|
||||
int creole_compile(struct creole_env *env, struct creole_reader *r)
|
||||
{
|
||||
struct creole_ins *cur_ins = env->prg;
|
||||
int rcode;
|
||||
|
||||
while (env->prgptr < env->prglen) {
|
||||
if (!creole_parse_line(cur_ins, r))
|
||||
return CREOLE_PARSE_ERROR;
|
||||
/* Increase prgptr here. If the instruction is a compile
|
||||
* time instruction, then this will be decremented since
|
||||
* the instruction will not be executed.
|
||||
*/
|
||||
env->prgptr++;
|
||||
|
||||
rcode = handle_compiletime_immediate(env, cur_ins);
|
||||
if (rcode != CREOLE_COMPILE_OK)
|
||||
return rcode;
|
||||
|
||||
if (read_eof(r))
|
||||
break;
|
||||
cur_ins += 1;
|
||||
}
|
||||
|
||||
if (env->prgptr == env->prglen && *line)
|
||||
return CREOLE_PROGRAM_OVERFLOW;
|
||||
env->prgend = env->prgptr;
|
||||
env->prgptr = 0;
|
||||
return CREOLE_COMPILE_OK;
|
||||
}
|
||||
|
||||
static creole_word read_word(struct creole_ins *ins, int i)
|
||||
{
|
||||
if (env->w_flags[i] == CREOLE_REGISTER)
|
||||
return env->reg[env->w[i]];
|
||||
else
|
||||
return env->w[i];
|
||||
}
|
||||
|
||||
int creole_step(struct creole_env *env)
|
||||
{
|
||||
struct creole_ins *ins = env->prg + env->prgptr;
|
||||
creole_word a1, a2;
|
||||
|
||||
if (env->prgptr == env->prgend)
|
||||
return CREOLE_STEP_STOP;
|
||||
env->prgptr++;
|
||||
|
||||
switch (ins->opcode) {
|
||||
case CREOLE_PUSH:
|
||||
if (env->stkptr == env->stklen)
|
||||
return CREOLE_STACK_OVERFLOW;
|
||||
env->stk[env->stkptr++] = env->reg[env->w[0]];
|
||||
break;
|
||||
case CREOLE_POP:
|
||||
if (env->stkptr == 0)
|
||||
return CREOLE_STACK_OVERFLOW;
|
||||
env->reg[env->w[0]] = env->stk[--env->stkptr];
|
||||
break;
|
||||
case CREOLE_ADD:
|
||||
a1 = read_word(ins, 1);
|
||||
a2 = read_word(ins, 2);
|
||||
env->reg[env->w[0]] = a1 + a2;
|
||||
break;
|
||||
case CREOLE_MUL:
|
||||
a1 = read_word(ins, 1);
|
||||
a2 = read_word(ins, 2);
|
||||
env->reg[env->w[0]] = a1 * a2;
|
||||
break;
|
||||
case CREOLE_DIV:
|
||||
a1 = read_word(ins, 1);
|
||||
a2 = read_word(ins, 2);
|
||||
env->reg[env->w[0]] = a1 / a2;
|
||||
break;
|
||||
case CREOLE_JL:
|
||||
a1 = read_word(ins, 1);
|
||||
a2 = read_word(ins, 2);
|
||||
if (a1 < a2)
|
||||
env->prgptr = env->lab[env->w[0]];
|
||||
break;
|
||||
case SYS:
|
||||
a1 = read_word(ins, 1);
|
||||
/* do syscall */
|
||||
break;
|
||||
}
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,67 @@
|
|||
#ifndef CREOLE_H
|
||||
#define CREOLE_H
|
||||
|
||||
#include <limits.h>
|
||||
|
||||
#ifndef CREOLE_WORD
|
||||
# define CREOLE_WORD unsigned int
|
||||
# define CREOLE_WORD_MAX UINT_MAX
|
||||
#endif
|
||||
|
||||
#define CREOLE_MAX_ARG 3
|
||||
|
||||
typedef CREOLE_WORD creole_word;
|
||||
|
||||
enum creole_opcode {
|
||||
CREOLE_NOOP = 0,
|
||||
CREOLE_PUSH = 1,
|
||||
CREOLE_POP = 2,
|
||||
CREOLE_ADD = 3,
|
||||
CREOLE_MUL = 4,
|
||||
CREOLE_DIV = 5,
|
||||
CREOLE_JL = 6,
|
||||
CREOLE_CLB = 7,
|
||||
CREOLE_SYS = 8,
|
||||
CREOLE_OPCODE_LEN
|
||||
};
|
||||
|
||||
enum creole_word_flag {
|
||||
CREOLE_IMMEDIATE,
|
||||
CREOLE_REGISTER,
|
||||
CREOLE_WORD_FLAGS_LEN
|
||||
};
|
||||
|
||||
enum creole_compiler_ret {
|
||||
CREOLE_COMPILE_OK,
|
||||
CREOLE_COMPILE_PARSE_ERROR,
|
||||
CREOLE_LABEL_OVERFLOW,
|
||||
CREOLE_TYPE_ERROR,
|
||||
CREOLE_COMPILE_RET_LEN
|
||||
};
|
||||
|
||||
struct creole_ins {
|
||||
enum creole_opcode opcode;
|
||||
unsigned char w_flags[3];
|
||||
creole_word w[3];
|
||||
};
|
||||
|
||||
struct creole_env {
|
||||
creole_word *reg;
|
||||
size_t reglen;
|
||||
|
||||
size_t *lab;
|
||||
size_t lablen;
|
||||
|
||||
creole_word *stk;
|
||||
size_t stkptr, stklen;
|
||||
|
||||
struct creole_ins *prg;
|
||||
size_t prgptr, prgend, prglen;
|
||||
};
|
||||
|
||||
struct creole_reader {
|
||||
unsigned char *p;
|
||||
size_t left;
|
||||
};
|
||||
|
||||
#endif /* CREOLE_H */
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,44 @@
|
|||
#include "greatest.h"
|
||||
GREATEST_MAIN_DEFS();
|
||||
#include "creole.c"
|
||||
|
||||
/**************************************************************************
|
||||
* Reader suite
|
||||
*************************************************************************/
|
||||
#define reader_lit(r, s) do { \
|
||||
r.p = (unsigned char *)s; \
|
||||
r.left = sizeof(s) - 1; \
|
||||
} while(0)
|
||||
|
||||
TEST reader_test_basic(struct creole_reader *r) {
|
||||
size_t i = 0;
|
||||
unsigned char *s = r->p;
|
||||
size_t len = r->left;
|
||||
|
||||
for (i = 0; i < len; i++) {
|
||||
ASSERT_EQ(read_eof(r), 0);
|
||||
ASSERT_EQ(read(r), s[i]);
|
||||
}
|
||||
|
||||
for (i = 0; i < 5; i++) {
|
||||
ASSERT_EQ(read_eof(r), 1);
|
||||
ASSERT_EQ(read(r), -1);
|
||||
}
|
||||
PASS();
|
||||
}
|
||||
|
||||
SUITE(reader) {
|
||||
struct creole_reader r = {0};
|
||||
|
||||
reader_lit(r, "abcdefg");
|
||||
RUN_TEST1(reader_test_basic, &r);
|
||||
|
||||
reader_lit(r, "");
|
||||
RUN_TEST1(reader_test_basic, &r);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
GREATEST_MAIN_BEGIN();
|
||||
RUN_SUITE(reader);
|
||||
GREATEST_MAIN_END();
|
||||
}
|
Loading…
Reference in New Issue