creole/creole.c

668 lines
18 KiB
C

/* Copyright (c) 2023 Peter McGoron <code@mcgoron.com>
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
#include "creole.h"
/*************************************************************************
* Static information
************************************************************************/
/* Arguments to opcodes can accept the following:
 *   immediate values only (as of now, no opcodes are like this)
 *   register values only (pop, and the destination of math operations)
 *   either values (push, sys, and the operands of math operations and jumps)
 *   labels (clb, and the target of jumps)
 *   none (do not give an argument)
 * These are the entries of opcode_info[].argtype; typecheck() enforces
 * them when a program is compiled.
 */
enum creole_arg_type {
TYPE_NONE, /* unused argument slot; typecheck() rejects it */
TYPE_IMM, /* immediate only */
TYPE_REG, /* register only; index must be below env->reglen */
TYPE_VAL, /* immediate or register */
TYPE_LAB, /* immediate naming a label; must be below env->lablen */
CREOLE_ARG_TYPE_LEN /* number of argument types; keep last */
};
/* C99+ allows for designating the array index when initializing arrays:
 *     [i] = v,
 * In C89 indices are implicit, from 0 to the maximum filled-in value.
 *
 * defop(s, n, a1, a2, a3) builds one row of the table: n is the number
 * of arguments the opcode takes, and a1..a3 are the accepted types of
 * the argument slots. The opcode name s is not used by the expansion;
 * it only labels the row for the reader.
 *
 * opcode_info is indexed by opcode number (see creole_parse_line() and
 * typecheck_ins()), so the rows must be listed in the order of the
 * opcode constants. NOTE(review): those constants are declared in
 * creole.h, which is not visible here -- confirm the order matches.
 */
#define defop(s, n, a1, a2, a3) {n, {a1, a2, a3}}
static const struct {
unsigned arglen; /* number of arguments the opcode takes */
enum creole_arg_type argtype[CREOLE_MAX_ARG]; /* accepted type per slot */
} opcode_info[CREOLE_OPCODE_LEN] = {
defop(NOOP, 0, TYPE_NONE, TYPE_NONE, TYPE_NONE),
defop(PUSH, 1, TYPE_VAL, TYPE_NONE, TYPE_NONE),
defop(POP, 1, TYPE_REG, TYPE_NONE, TYPE_NONE),
defop(ADD, 3, TYPE_REG, TYPE_VAL, TYPE_VAL),
defop(MUL, 3, TYPE_REG, TYPE_VAL, TYPE_VAL),
defop(DIV, 3, TYPE_REG, TYPE_VAL, TYPE_VAL),
defop(SYS, 1, TYPE_VAL, TYPE_NONE, TYPE_NONE),
defop(CLB, 1, TYPE_LAB, TYPE_NONE, TYPE_NONE),
defop(JL, 3, TYPE_LAB, TYPE_VAL, TYPE_VAL),
defop(JLE, 3, TYPE_LAB, TYPE_VAL, TYPE_VAL),
defop(JE, 3, TYPE_LAB, TYPE_VAL, TYPE_VAL),
defop(JNE, 3, TYPE_LAB, TYPE_VAL, TYPE_VAL)
};
/*************************************************************************
* Reading from the buffer
************************************************************************/
/* Take the next byte out of the reader.
 *
 * Returns the value stored at the read position, or -1 when no bytes
 * are left. On success the read position moves forward by one and the
 * count of remaining bytes drops by one.
 */
static int read(struct creole_reader *r)
{
	int byte;

	if (r->left == 0)
		return -1;
	byte = *r->p;
	r->p++;
	r->left--;
	return byte;
}
/* Returns 1 when the reader has no bytes left, 0 otherwise. */
static int read_eof(struct creole_reader *r)
{
	if (r->left != 0)
		return 0;
	return 1;
}
/*************************************************************************
* Pseudo-UTF-8 lexing
*
* Pseudo-UTF-8 is based off of UTF-8 but adds more
* bytes and allows (requires!) overlong encodings.
*
* Possible values:
* 0xxxxxxx (7 bits)
* 110HHHHx 10xxxxxx (11 bits)
* 1110HHHH 10xxxxxx 10xxxxxx (16 bits)
* 11110HHH 10Hxxxxx 10xxxxxx 10xxxxxx (21 bits)
* 111110HH 10HHxxxx 10xxxxxx 10xxxxxx 10xxxxxx (26 bits)
* 1111110H 10HHHxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (31 bits)
* 11111110 10HHHHxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (36 bits)
* 10xxxxxx
************************************************************************/
/* A Pseudo-UTF-8 sequence can be either
 *
 *   * A 1 byte sequence, where the lower 7 bits are the encoded
 *     word (no high bits).
 *   * A multi-byte sequence where the 4 MSB (the H bits) are flags,
 *     and the lower bits are the encoded word.
 */
/* Largest value that fits in the 4 high-bit flags.
 * NOTE(review): not referenced in the visible part of this file.
 */
#define MAX_HIGH_BITS 15
/* Result of decoding one Pseudo-UTF-8 sequence (see decode_seq()). */
struct word {
int len; /* total length of the sequence in bytes, 1 to 7 */
int high_bits; /* the H flag bits; 0 for a 1 byte sequence */
creole_word word; /* the encoded word (the x bits) */
};
/* Read to_read ordinary continuation bytes (10xxxxxx) and append the
 * six payload bits of each one to the low end of w->word. None of these
 * bytes carries high-bit flags.
 *
 * Returns 1 when every byte was read and well formed, 0 when the reader
 * ran out of bytes or a byte did not start with the bits 10.
 */
static int read_continue(struct creole_reader *r, struct word *w,
                         int to_read)
{
	int remaining;

	for (remaining = to_read; remaining > 0; remaining--) {
		const int got = read(r);
		unsigned char byte;

		if (got < 0)
			return 0;
		/* Characters might not be 8 bits! */
		byte = (unsigned char)(got & 0xFF);
		if ((byte & 0xC0) != 0x80)
			return 0;
		w->word = (w->word << 6) | (byte & 0x3F);
	}
	return 1;
}
/* Classify the first byte of a sequence.
 *
 * Depending on the sequence length, the start byte holds part of the
 * encoded word, part (or all) of the high-bit flags, or neither. When
 * the start byte cannot hold all of the flags, the rest is stored in
 * the first continuation byte (see parse_special_byte()).
 *
 * On success w->len, w->word and w->high_bits are filled in from c and
 * the number of ordinary continuation bytes to read is returned; this
 * count does not include a continuation byte that carries flags.
 * Returns -1, leaving w untouched, if c is not a valid start byte.
 */
#define START_BYTE_NUM 7
static int parse_start_byte(unsigned char c, struct word *w)
{
	/* One row per multi-byte start pattern. Row k is the pattern
	 * whose k lowest bits are free, so a byte matches row k when
	 * it equals the pattern after both are shifted right by k.
	 * This is why the rows run from the longest sequence to the
	 * shortest.
	 */
	static const struct {
		/* The fixed bits of the pattern, free bits zeroed. */
		unsigned char pattern;
		/* Mask of the word bits held in the start byte. They
		 * always sit at the LSB; 0 if there are none.
		 */
		unsigned char data_mask;
		/* The flag bits are found by shifting the byte right
		 * by flag_shift and then applying flag_mask.
		 */
		unsigned char flag_mask;
		int flag_shift;
		/* Ordinary continuation bytes that follow. */
		int plain_bytes;
	} rows[START_BYTE_NUM-1] = {
		{0xFE, 0x00, 0x0, 0, 5}, /* 11111110 */
		{0xFC, 0x00, 0x1, 0, 4}, /* 1111110x */
		{0xF8, 0x00, 0x3, 0, 3}, /* 111110xx */
		{0xF0, 0x00, 0x7, 0, 2}, /* 11110xxx */
		{0xE0, 0x00, 0xF, 0, 2}, /* 1110xxxx */
		{0xC0, 0x01, 0xF, 1, 1}  /* 110xxxxx */
	};
	int k;

	/* 0xxxxxxx: the byte is the whole sequence. No multi-byte
	 * pattern can match such a byte, so testing it first gives the
	 * same answer as testing it last.
	 */
	if (c >> 7 == 0) {
		w->len = 1;
		w->word = c;
		w->high_bits = 0;
		return 0;
	}

	for (k = 0; k < START_BYTE_NUM-1; k++) {
		if (c >> k != rows[k].pattern >> k)
			continue;
		w->len = START_BYTE_NUM - k;
		w->word = c & rows[k].data_mask;
		w->high_bits = (c >> rows[k].flag_shift) & rows[k].flag_mask;
		return rows[k].plain_bytes;
	}
	return -1;
}
/* This parses the first continuation byte if it is special, that is,
 * in sequences of 4 to 7 bytes, where it carries the remaining high-bit
 * flags above the most significant word bits:
 *
 *   len 4: 10Hxxxxx    len 5: 10HHxxxx
 *   len 6: 10HHHxxx    len 7: 10HHHHxx
 *
 * c is the byte to parse. w must already hold the length and the
 * partial flags produced by parse_start_byte(). On success the flags
 * found in c are appended below the ones already in w->high_bits, and
 * w->word is set to the word bits of c.
 *
 * Returns 1 on success. Returns 0, leaving w untouched, when w->len
 * is not a length that has a special byte, or when c does not begin
 * with the continuation marker 10.
 */
#define SPECIAL_CONTINUE_BYTE_NUM (START_BYTE_NUM - 3)
static int parse_special_byte(unsigned char c, struct word *w)
{
	/* Both tables are indexed by i = w->len - 4, which is also one
	 * less than the number of flag bits stored in this byte.
	 *
	 * The flag bits are moved to the LSB by shifting the byte right
	 * by (5 - i); mask[i] then discards the 10 marker above them.
	 */
	static const unsigned char mask[SPECIAL_CONTINUE_BYTE_NUM] = {
		0x1, /* 11110HHH 10Hxxxxx */
		0x3, /* 111110HH 10HHxxxx */
		0x7, /* 1111110H 10HHHxxx */
		0xF  /* 11111110 10HHHHxx */
	};
	/* wordmask[i] selects the word bits left below the flags. */
	static const unsigned char wordmask[SPECIAL_CONTINUE_BYTE_NUM] = {
		0x1F, 0xF, 0x7, 0x3
	};
	int i = w->len - 4;

	/* Never index the tables with a length outside 4..7. */
	if (i < 0 || i >= SPECIAL_CONTINUE_BYTE_NUM)
		return 0;
	/* Like every other continuation byte, this one must have the
	 * form 10xxxxxx. Without this test a malformed sequence such as
	 * a 4 byte start followed by an ASCII byte would be accepted.
	 */
	if ((c & 0xC0) != 0x80)
		return 0;
	w->high_bits = (w->high_bits << (i + 1)) | ((c >> (5 - i)) & mask[i]);
	w->word = c & wordmask[i];
	return 1;
}
/* Decode one complete Pseudo-UTF-8 sequence from r into w.
 *
 * w->high_bits is reset to 0 before anything is read. Returns 1 when a
 * whole, well formed sequence was decoded, and 0 when the reader ran
 * out of bytes or one of the bytes was malformed.
 */
static int decode_seq(struct creole_reader *r, struct word *w)
{
	int byte;
	int plain;

	w->high_bits = 0;

	byte = read(r);
	if (byte < 0)
		return 0;
	plain = parse_start_byte((unsigned char)(byte & 0xFF), w);
	if (plain < 0)
		return 0;

	/* When the count of ordinary continuation bytes is exactly one
	 * less than the sequence length, no byte carries extra flags.
	 */
	if (w->len - plain <= 1)
		return read_continue(r, w, plain);

	/* Otherwise the first continuation byte holds high bits. */
	byte = read(r);
	if (byte < 0)
		return 0;
	if (!parse_special_byte((unsigned char)(byte & 0xFF), w))
		return 0;
	return read_continue(r, w, plain);
}
/* Encode the word i as a Pseudo-UTF-8 sequence of exactly encode_to
 * bytes.
 *
 * i          value to encode
 * encode_to  length of the sequence to produce, 1 to 7
 * high_bits  the 4 flag bits stored in a multi-byte sequence; ignored
 *            when encode_to is 1, because a 1 byte sequence has none
 * buf        receives the sequence; only the first encode_to bytes are
 *            written
 *
 * Returns 1 on success. Returns 0, without writing to buf, when
 * encode_to is outside 1..7, when i does not fit in a sequence of that
 * length, or when high_bits does not fit in 4 bits.
 */
int creole_encode(creole_word i, unsigned encode_to, unsigned high_bits,
                  unsigned char buf[7])
{
	/* One row per multi-byte length; row (encode_to - 2) is used. */
	static const struct {
		creole_word max;
		unsigned char b1_mask;
		int high_bit_shift_b1;
		int high_bit_shift_to_right_b1;
		int data_shift_b1;
		int high_bit_mask_b2;
		int high_bit_shift_b2;
		unsigned char b2_data_mask;
	} d[] = {
		{0x7F,       0xC0, 0, 1, 6,  0x0, 0, 0x3F}, /* 2 */
		{0xFFF,      0xE0, 0, 0, 12, 0x0, 0, 0x3F}, /* 3 */
		{0x1FFFF,    0xF0, 1, 0, 17, 0x1, 5, 0x1F}, /* 4 */
		{0x3FFFFF,   0xF8, 2, 0, 22, 0x3, 4, 0x0F}, /* 5 */
		{0x7FFFFFF,  0xFC, 3, 0, 27, 0x7, 3, 0x07}, /* 6 */
		{0xFFFFFFFF, 0xFE, 4, 0, 32, 0xF, 2, 0x03}  /* 7 */
	};
	unsigned lb;
	unsigned j;

	/* d[] only has rows for 2..7 bytes and buf only has room for 7.
	 * A length of 0 or of 8 and above would index outside both.
	 */
	if (encode_to < 1 || encode_to > 7)
		return 0;

	if (encode_to == 1) {
		if (i < 0x80) {
			buf[0] = (unsigned char)i;
			return 1;
		}
		return 0;
	}

	/* Only 4 flag bits exist in the format. Anything above them
	 * would spill into the length prefix of the start byte.
	 */
	if (high_bits > 0xF)
		return 0;

	lb = encode_to - 2;
	if (i > d[lb].max) {
		return 0;
	}

	buf[0] = (unsigned char)(d[lb].b1_mask
	         | (high_bits >> d[lb].high_bit_shift_b1
	                      << d[lb].high_bit_shift_to_right_b1));
	/* shifts greater than or equal to the bit size of a type are
	 * undefined. Data in the first byte is always aligned with the LSB.
	 */
	if ((unsigned)d[lb].data_shift_b1 < sizeof(i) * CHAR_BIT)
		buf[0] |= (unsigned char)(i >> d[lb].data_shift_b1);

	/* Second byte: the flags that did not fit in the start byte,
	 * followed by the most significant word bits not yet written.
	 */
	buf[1] = (unsigned char)(0x80
	         | ((high_bits & d[lb].high_bit_mask_b2)
	            << d[lb].high_bit_shift_b2)
	         | ((i >> ((encode_to - 2) * 6))
	            & d[lb].b2_data_mask));

	/* Remaining bytes: six word bits each, most significant first. */
	for (j = 2; j < encode_to; j++) {
		buf[j] = (unsigned char)(0x80
		         | ((i >> ((encode_to - j - 1) * 6)) & 0x3F));
	}
	return 1;
}
/*************************************************************************
* Parsing instructions
*
* This parses an entire instruction, which is
* a single byte sequence,
* zero or more multibyte sequences,
* one single byte of all zeros.
*************************************************************************/
enum creole_compiler_ret
creole_parse_line(struct creole_ins *ins, struct creole_reader *r)
{
struct word w = {0};
unsigned arg = 0;
if (!decode_seq(r, &w))
return CREOLE_OPCODE_READ_ERROR;
ins->opcode = w.word;
if (w.word >= CREOLE_OPCODE_LEN || w.len != 1) {
return CREOLE_OPCODE_MALFORMED;
}
for (arg = 0; arg < opcode_info[ins->opcode].arglen; arg++) {
if (!decode_seq(r, &w))
return CREOLE_ARG_READ_ERROR;
if (w.len == 1)
return CREOLE_ARG_MALFORMED;
ins->w[arg] = w.word;
ins->w_flags[arg] = w.high_bits;
}
if (!decode_seq(r, &w))
return CREOLE_LAST_READ_ERROR;
if (w.word != 0 || w.len != 1)
return CREOLE_LAST_MALFORMED;
return CREOLE_COMPILE_OK;
}
/**************************************************************************
* High level compiling interface
*************************************************************************/
/* Returns 1 when reg is below env->reglen, 0 otherwise. */
static int valid_register(struct creole_env *env, int reg)
{
	if (reg < env->reglen)
		return 1;
	return 0;
}
/* Returns 1 when the label number reg is below env->lablen, 0 otherwise. */
static int valid_label(struct creole_env *env, int reg)
{
	if (reg < env->lablen)
		return 1;
	return 0;
}
/* Check one argument against the type its opcode slot accepts.
 *
 * val is the argument's word, fl says whether it is an immediate or a
 * register, and typ is the type the slot accepts. Returns 1 when the
 * argument is acceptable and 0 when it is not; a slot of any type not
 * listed below (such as TYPE_NONE) accepts nothing.
 */
static int typecheck(struct creole_env *env, int val,
                     enum creole_word_flag fl, enum creole_arg_type typ)
{
	if (typ == TYPE_IMM)
		return fl == CREOLE_IMMEDIATE;

	/* A register must also name an existing register. */
	if (typ == TYPE_REG) {
		if (fl != CREOLE_REGISTER)
			return 0;
		return valid_register(env, val) != 0;
	}

	if (typ == TYPE_VAL)
		return fl == CREOLE_IMMEDIATE || fl == CREOLE_REGISTER;

	/* A label is an immediate that names an existing label. */
	if (typ == TYPE_LAB) {
		if (fl != CREOLE_IMMEDIATE)
			return 0;
		return valid_label(env, val) != 0;
	}

	return 0;
}
/* Bit 0 of an argument's high bits tells registers from immediates:
 * set means CREOLE_REGISTER, clear means CREOLE_IMMEDIATE.
 */
static enum creole_word_flag get_type_from_high_bit(unsigned high_bits)
{
	return (high_bits & 1) ? CREOLE_REGISTER : CREOLE_IMMEDIATE;
}
static enum creole_compiler_ret typecheck_ins(struct creole_env *env,
struct creole_ins *ins)
{
unsigned i;
for (i = 0; i < opcode_info[ins->opcode].arglen; i++) {
if (!typecheck(env, ins->w[i],
get_type_from_high_bit(ins->w_flags[i]),
opcode_info[ins->opcode].argtype[i]))
return CREOLE_TYPE_ERROR;
}
return CREOLE_COMPILE_OK;
}
static void clear_ins(struct creole_ins *i)
{
const struct creole_ins blank = {0};
*i = blank;
}
/****
 * Get rid of instructions that can be written out at compile time.
 *
 * CLB records the current program position as the target of its label,
 * and NOOP does nothing; in both cases the instruction is cleared and
 * CREOLE_COMPILE_CLEARED_INSTRUCTION is returned so that the caller
 * reuses the slot. CLB returns CREOLE_LABEL_OVERFLOW instead when the
 * label number is not below env->lablen.
 *
 * Every other instruction is kept and the result of typecheck_ins()
 * is returned.
 ***/
static enum creole_compiler_ret
handle_compiletime_immediate(struct creole_env *env,
                             struct creole_ins *cur_ins)
{
	if (cur_ins->opcode == CREOLE_NOOP) {
		clear_ins(cur_ins);
		return CREOLE_COMPILE_CLEARED_INSTRUCTION;
	}

	if (cur_ins->opcode != CREOLE_CLB)
		return typecheck_ins(env, cur_ins);

	if (cur_ins->w[0] >= env->lablen)
		return CREOLE_LABEL_OVERFLOW;
	env->lab[cur_ins->w[0]] = env->prgptr;
	/* The label only exists at compile time: clear the instruction
	 * so that the next one is placed in its slot.
	 */
	clear_ins(cur_ins);
	return CREOLE_COMPILE_CLEARED_INSTRUCTION;
}
/* TODO: The compile step can be completely removed in favor of directly
* executing the bytecode, disassembling it with creole_parse_line()
* at every instance. This will also make the implementation simpler.
*/
enum creole_compiler_ret
creole_compile(struct creole_env *env, struct creole_reader *r)
{
struct creole_ins *cur_ins = env->prg;
int rcode;
while (env->prgptr < env->prglen) {
rcode = creole_parse_line(cur_ins, r);
if (rcode != CREOLE_COMPILE_OK)
return rcode;
rcode = handle_compiletime_immediate(env, cur_ins);
switch (rcode) {
case CREOLE_COMPILE_CLEARED_INSTRUCTION:
break;
case CREOLE_COMPILE_OK:
cur_ins++;
env->prgptr++;
break;
default:
return rcode;
}
if (read_eof(r))
break;
}
if (env->prgptr == env->prglen && !read_eof(r))
return CREOLE_PROGRAM_OVERFLOW;
env->prgend = env->prgptr;
env->prgptr = 0;
return CREOLE_COMPILE_OK;
}
/* Store w in register reg.
 *
 * Returns CREOLE_STEP_CONTINUE on success, or CREOLE_REGISTER_OVERFLOW,
 * without storing anything, when reg is not a valid register.
 */
enum creole_run_ret creole_reg_write(struct creole_env *env, unsigned reg,
                                     creole_word w)
{
	if (valid_register(env, reg)) {
		env->reg[reg] = w;
		return CREOLE_STEP_CONTINUE;
	}
	return CREOLE_REGISTER_OVERFLOW;
}
/* Copy the contents of register reg into *w.
 *
 * Returns CREOLE_STEP_CONTINUE on success, or CREOLE_REGISTER_OVERFLOW,
 * leaving *w untouched, when reg is not a valid register.
 */
enum creole_run_ret creole_reg_read(struct creole_env *env, unsigned reg,
                                    creole_word *w)
{
	if (valid_register(env, reg)) {
		*w = env->reg[reg];
		return CREOLE_STEP_CONTINUE;
	}
	return CREOLE_REGISTER_OVERFLOW;
}
/* Fetch the value of argument number arg of ins into *w.
 *
 * An immediate argument yields its own word. A register argument
 * yields the contents of the register it names, and the result of
 * creole_reg_read() is returned. Returns CREOLE_STEP_CONTINUE for an
 * immediate.
 */
static enum creole_run_ret read_val(struct creole_env *env,
                                    struct creole_ins *ins,
                                    unsigned arg,
                                    creole_word *w)
{
	if (get_type_from_high_bit(ins->w_flags[arg]) != CREOLE_REGISTER) {
		*w = ins->w[arg];
		return CREOLE_STEP_CONTINUE;
	}
	return creole_reg_read(env, ins->w[arg], w);
}
/* Push w onto the stack.
 *
 * Returns CREOLE_STEP_CONTINUE on success, or CREOLE_STACK_OVERFLOW,
 * without storing anything, when env->stkptr equals env->stklen.
 */
enum creole_run_ret creole_push(struct creole_env *env, creole_word w)
{
	if (env->stkptr != env->stklen) {
		env->stk[env->stkptr] = w;
		env->stkptr++;
		return CREOLE_STEP_CONTINUE;
	}
	return CREOLE_STACK_OVERFLOW;
}
/* Pop the top of the stack into *w.
 *
 * Returns CREOLE_STEP_CONTINUE on success, or CREOLE_STACK_UNDERFLOW,
 * leaving *w untouched, when the stack is empty.
 */
enum creole_run_ret creole_pop(struct creole_env *env, creole_word *w)
{
	if (env->stkptr != 0) {
		env->stkptr--;
		*w = env->stk[env->stkptr];
		return CREOLE_STEP_CONTINUE;
	}
	return CREOLE_STACK_UNDERFLOW;
}
/* Returns CREOLE_STEP_CONTINUE when label is below env->lablen and
 * CREOLE_RUN_LABEL_OVERFLOW otherwise.
 */
static enum creole_run_ret
check_label(struct creole_env *env, creole_word label)
{
	if (label < env->lablen)
		return CREOLE_STEP_CONTINUE;
	return CREOLE_RUN_LABEL_OVERFLOW;
}
/* Which of the two operands of an instruction are to be treated as
 * signed. The value is a two bit field: bit 1 stands for the first
 * operand and bit 0 for the second.
 */
enum argument_signed {
	ALL_UNSIGNED  = 0, /* 0b00 */
	SECOND_SIGNED = 1, /* 0b01 */
	FIRST_SIGNED  = 2, /* 0b10 */
	ALL_SIGNED    = 3  /* 0b11 */
};

/* Combine the sign flag (bit 1) of two arguments' high bits into an
 * enum argument_signed. flags1 belongs to the first operand and flags2
 * to the second; every other bit of the flags is ignored.
 */
static enum argument_signed check_sign_bits(unsigned flags1, unsigned flags2)
{
	unsigned combined = ALL_UNSIGNED;

	if (flags1 & 0x2)
		combined |= FIRST_SIGNED;
	if (flags2 & 0x2)
		combined |= SECOND_SIGNED;
	return (enum argument_signed)combined;
}
/* Evaluate fun, store its result in the caller's local variable rcode,
 * and return that result from the enclosing function unless it is
 * CREOLE_STEP_CONTINUE. Only usable in a function that declares rcode.
 * The macro is #undef'd after creole_step().
 */
#define check(fun) do { \
rcode = fun; \
if (rcode != CREOLE_STEP_CONTINUE) \
return rcode; \
} while(0)
/* Apply the binary operator OPER to the caller's locals a1 and a2 and
 * store the result back in a1. The sign flags of arguments 1 and 2 of
 * the caller's ins select which operands are cast to creole_signed
 * before the operation. Returns CREOLE_STEP_HIGH_BIT_MALFORMED from the
 * enclosing function if check_sign_bits() yields a value that is not
 * one of the four enum argument_signed constants.
 *
 * NOTE(review): in the FIRST_SIGNED and SECOND_SIGNED cases only one
 * operand is cast. If creole_signed and creole_word have the same width
 * (they are defined in creole.h, not visible here), the usual
 * arithmetic conversions turn the cast operand back into an unsigned
 * value, so these two cases would behave like ALL_UNSIGNED -- confirm
 * that this is intended.
 */
#define chk_sign_op(OPER) do { \
switch (check_sign_bits(ins->w_flags[1], ins->w_flags[2])) { \
case ALL_UNSIGNED: \
a1 = a1 OPER a2; \
break; \
case FIRST_SIGNED: \
a1 = (creole_signed)a1 OPER a2; \
break; \
case SECOND_SIGNED: \
a1 = a1 OPER (creole_signed)a2; \
break; \
case ALL_SIGNED: \
a1 = (creole_signed) a1 OPER (creole_signed) a2; \
break; \
default: \
return CREOLE_STEP_HIGH_BIT_MALFORMED; \
} \
} while(0)
/* Execute the instruction at env->prgptr.
 *
 * env  the machine state
 * sc   receives the argument of a SYS instruction
 *
 * Returns
 *   CREOLE_STEP_STOP            env->prgptr is at or past env->prgend;
 *                               nothing was executed
 *   CREOLE_STEP_CONTINUE        the instruction was executed
 *   CREOLE_STEP_SYSCALL         a SYS instruction was executed and its
 *                               argument was stored in *sc
 *   CREOLE_DIV_BY_ZERO          the divisor of a DIV was zero
 *   CREOLE_STEP_UNKNOWN_OPCODE  the opcode is not handled here
 * or the error reported by one of the helpers (register, stack and
 * label checks).
 *
 * env->prgptr is advanced past the instruction except when a jump is
 * taken, in which case it is set to the target of the label, and
 * except when an error is returned before the end of the function
 * (every error other than CREOLE_STEP_UNKNOWN_OPCODE), in which case
 * it is left unchanged.
 */
enum creole_run_ret creole_step(struct creole_env *env, creole_word *sc)
{
	struct creole_ins *ins;
	creole_word a1, a2;
	int rcode = CREOLE_STEP_CONTINUE;
	int increase_pointer = 1;

	/* Compare with >= and not ==: a jump through a label that was
	 * never defined by CLB takes whatever value env->lab holds, which
	 * may lie beyond the end of the program.
	 */
	if (env->prgptr >= env->prgend)
		return CREOLE_STEP_STOP;
	/* Only form the pointer once the index is known to be valid. */
	ins = env->prg + env->prgptr;

	switch (ins->opcode) {
	case CREOLE_PUSH:
		check(read_val(env, ins, 0, &a1));
		check(creole_push(env, a1));
		break;
	case CREOLE_POP:
		check(creole_pop(env, &a1));
		check(creole_reg_write(env, ins->w[0], a1));
		break;
	case CREOLE_ADD:
		check(read_val(env, ins, 1, &a1));
		check(read_val(env, ins, 2, &a2));
		check(creole_reg_write(env, ins->w[0], a1 + a2));
		break;
	case CREOLE_MUL:
		check(read_val(env, ins, 1, &a1));
		check(read_val(env, ins, 2, &a2));
		check(creole_reg_write(env, ins->w[0], a1 * a2));
		break;
	case CREOLE_DIV:
		check(read_val(env, ins, 1, &a1));
		check(read_val(env, ins, 2, &a2));
		if (a2 == 0) {
			return CREOLE_DIV_BY_ZERO;
		}
		if ((ins->w_flags[1] & 0x2) && (ins->w_flags[2] & 0x2)
		    && a2 == (creole_word)-1) {
			/* Both operands are flagged signed (bit 1 of the
			 * flags, the bit check_sign_bits() inspects) and
			 * the divisor is -1. Dividing the most negative
			 * value by -1 overflows, which is undefined for
			 * signed integers, so negate in unsigned
			 * arithmetic instead: this gives the same word
			 * for every dividend and wraps on overflow.
			 * NOTE(review): assumes creole_signed is the
			 * two's complement counterpart of creole_word
			 * with the same width -- confirm in creole.h.
			 */
			a1 = (creole_word)0 - a1;
		} else {
			chk_sign_op(/);
		}
		check(creole_reg_write(env, ins->w[0], a1));
		break;
	case CREOLE_SYS:
		check(read_val(env, ins, 0, sc));
		rcode = CREOLE_STEP_SYSCALL;
		break;
	case CREOLE_JL:
		check(read_val(env, ins, 1, &a1));
		check(read_val(env, ins, 2, &a2));
		check(check_label(env, ins->w[0]));
		chk_sign_op(<);
		if (a1) {
			env->prgptr = env->lab[ins->w[0]];
			increase_pointer = 0;
		}
		break;
	case CREOLE_JLE:
		check(read_val(env, ins, 1, &a1));
		check(read_val(env, ins, 2, &a2));
		check(check_label(env, ins->w[0]));
		chk_sign_op(<=);
		if (a1) {
			env->prgptr = env->lab[ins->w[0]];
			increase_pointer = 0;
		}
		break;
	case CREOLE_JE:
		check(read_val(env, ins, 1, &a1));
		check(read_val(env, ins, 2, &a2));
		check(check_label(env, ins->w[0]));
		if (a1 == a2) {
			env->prgptr = env->lab[ins->w[0]];
			increase_pointer = 0;
		}
		break;
	case CREOLE_JNE:
		check(read_val(env, ins, 1, &a1));
		check(read_val(env, ins, 2, &a2));
		check(check_label(env, ins->w[0]));
		if (a1 != a2) {
			env->prgptr = env->lab[ins->w[0]];
			increase_pointer = 0;
		}
		break;
	default:
		rcode = CREOLE_STEP_UNKNOWN_OPCODE;
	}

	if (increase_pointer)
		env->prgptr++;
	return rcode;
}
#undef check