/* creole/creole.c -- 654 lines, 18 KiB, C */

#include "creole.h"
/*************************************************************************
* Static information
************************************************************************/
/* Arguments to opcodes can accept the following:
* immediate values only (as of now, no values are like this)
* register values only (push, pop, etc.)
* either values (math operations)
* labels (jumps)
* none (do not give an argument)
*/
/* Kinds of operand an argument slot of an opcode may accept.  The
 * enumerators count up from zero, so the last one is the number of kinds.
 */
enum creole_arg_type {
	TYPE_NONE = 0,      /* slot unused: no argument is given */
	TYPE_IMM,           /* immediate value only */
	TYPE_REG,           /* register only */
	TYPE_VAL,           /* either a register or an immediate */
	TYPE_LAB,           /* label */
	CREOLE_ARG_TYPE_LEN /* number of kinds listed above */
};
/* C99+ allows for designating the array index when initializing arrays:
 *     [i] = v,
 * In C89, indices are implicit, running from 0 to the maximum filled-in
 * value.
 */
/* Build one row of the table below.  The first parameter (the opcode
 * name) does not appear in the expansion; it only labels the row for the
 * reader.
 */
#define defop(s, n, a1, a2, a3) {n, {a1, a2, a3}}
/* Per-opcode argument description: how many arguments the opcode takes
 * and which kind each argument slot accepts.  Rows are positional, and
 * the table is indexed with the numeric opcode value.
 * NOTE(review): the row order presumably mirrors the opcode enum declared
 * in creole.h -- confirm whenever an opcode is added or reordered.
 */
static const struct {
/* Number of arguments the opcode takes. */
unsigned arglen;
/* Accepted kind for each argument slot; unused slots are TYPE_NONE. */
enum creole_arg_type argtype[CREOLE_MAX_ARG];
} opcode_info[CREOLE_OPCODE_LEN] = {
defop(NOOP, 0, TYPE_NONE, TYPE_NONE, TYPE_NONE),
defop(PUSH, 1, TYPE_VAL, TYPE_NONE, TYPE_NONE),
defop(POP, 1, TYPE_REG, TYPE_NONE, TYPE_NONE),
defop(ADD, 3, TYPE_REG, TYPE_VAL, TYPE_VAL),
defop(MUL, 3, TYPE_REG, TYPE_VAL, TYPE_VAL),
defop(DIV, 3, TYPE_REG, TYPE_VAL, TYPE_VAL),
defop(SYS, 1, TYPE_VAL, TYPE_NONE, TYPE_NONE),
defop(CLB, 1, TYPE_LAB, TYPE_NONE, TYPE_NONE),
defop(JL, 3, TYPE_LAB, TYPE_VAL, TYPE_VAL),
defop(JLE, 3, TYPE_LAB, TYPE_VAL, TYPE_VAL),
defop(JE, 3, TYPE_LAB, TYPE_VAL, TYPE_VAL),
defop(JNE, 3, TYPE_LAB, TYPE_VAL, TYPE_VAL)
};
/*************************************************************************
* Reading from the buffer
************************************************************************/
/* Fetch the next byte from the reader and step past it.
 * Returns the byte, or -1 when nothing is left to read.
 */
static int read(struct creole_reader *r)
{
	int byte;

	if (r->left == 0)
		return -1;

	byte = *r->p;
	r->p++;
	r->left--;
	return byte;
}
/* Report whether the reader is exhausted: 1 when no bytes remain,
 * 0 otherwise.
 */
static int read_eof(struct creole_reader *r)
{
	if (r->left == 0)
		return 1;
	return 0;
}
/*************************************************************************
* Pseudo-UTF-8 lexing
*
* Pseudo-UTF-8 is based off of UTF-8 but adds more
* bytes and allows (requires!) overlong encodings.
*
* Possible values:
* 0xxxxxxx (7 bits)
* 110HHHHx 10xxxxxx (11 bits)
* 1110HHHH 10xxxxxx 10xxxxxx (16 bits)
* 11110HHH 10Hxxxxx 10xxxxxx 10xxxxxx (21 bits)
* 111110HH 10HHxxxx 10xxxxxx 10xxxxxx 10xxxxxx (26 bits)
* 1111110H 10HHHxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (31 bits)
* 11111110 10HHHHxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (36 bits)
* 10xxxxxx
************************************************************************/
/* A Pseudo-UTF-8 sequence can be either
 *
 *  * a 1-byte sequence, where the lower 7 bits are the encoded
 *    word (no high bits), or
 *  * a multi-byte sequence, which carries 4 high-bit flags in its
 *    most significant payload bits, with the lower bits holding the
 *    encoded word.
 */
/* Presumably the largest value the four high-bit flags can take; it is
 * not referenced in the visible part of this file.
 */
#define MAX_HIGH_BITS 15
/* One decoded Pseudo-UTF-8 sequence. */
struct word {
/* Total length of the sequence in bytes (1 for a single-byte sequence). */
int len;
/* Flag bits (the 'H' bits) gathered from the sequence. */
int high_bits;
/* Payload (the 'x' bits) of the sequence. */
creole_word word;
};
/* Consume `to_read` plain continuation bytes (10xxxxxx) and append the
 * six payload bits of each one to w->word.  These bytes are assumed to
 * carry no high bits.
 *
 * Returns 1 on success, 0 if the input ends early or a byte does not
 * carry the continuation marker.
 */
static int read_continue(struct creole_reader *r, struct word *w,
                         int to_read)
{
	int remaining;

	for (remaining = to_read; remaining > 0; remaining--) {
		int got = read(r);
		unsigned char byte;

		if (got < 0)
			return 0;

		/* Characters might not be 8 bits! */
		byte = (unsigned char)(got & 0xFF);
		if ((byte >> 6) != 0x2)
			return 0;

		w->word = (w->word << 6) | (byte & 0x3F);
	}
	return 1;
}
/* The start byte of a sequence determines its length.  Depending on the
 * length it may also carry payload bits and some (or all) of the
 * high-bit flags; for the longer sequences the remaining flags sit in
 * the first continuation byte.
 */
#define START_BYTE_NUM 7
/* Classify start byte `c` and store the sequence length, the payload
 * bits and the high bits found in it into `w`.
 *
 * Returns the number of PLAIN continuation bytes that follow (a
 * continuation byte holding high bits is not counted), or -1 if `c`
 * cannot start a sequence, in which case `w` is left untouched.
 */
static int parse_start_byte(unsigned char c, struct word *w)
{
	/* One row per multi-byte start pattern, longest sequence first.
	 * Row k has k free bits at the bottom of the byte, so a byte
	 * matches row k when it equals the pattern once both have been
	 * shifted right by k.
	 */
	static const struct {
		/* Start byte with all free bits zeroed. */
		unsigned char pattern;
		/* Payload bits; they always sit at the LSB (0 if none). */
		unsigned char data_mask;
		/* Applied after flag_shift to isolate the high bits. */
		unsigned char flag_mask;
		/* Right shift that brings the high bits down to the LSB. */
		int flag_shift;
		/* Plain continuation bytes still to be read. */
		int plain_bytes;
	} lead[START_BYTE_NUM-1] = {
		{0xFE, 0x00, 0x0, 0, 5}, /* 11111110 */
		{0xFC, 0x00, 0x1, 0, 4}, /* 1111110x */
		{0xF8, 0x00, 0x3, 0, 3}, /* 111110xx */
		{0xF0, 0x00, 0x7, 0, 2}, /* 11110xxx */
		{0xE0, 0x00, 0xF, 0, 2}, /* 1110xxxx */
		{0xC0, 0x01, 0xF, 1, 1}  /* 110xxxxx */
	};
	int k;

	/* 0xxxxxxx: the byte is the whole sequence.  No multi-byte row can
	 * match such a byte, so testing it first changes nothing.
	 */
	if ((c >> 7) == 0) {
		w->len = 1;
		w->word = c;
		w->high_bits = 0;
		return 0;
	}

	for (k = 0; k < START_BYTE_NUM-1; k++) {
		if ((c >> k) != (lead[k].pattern >> k))
			continue;

		w->len = START_BYTE_NUM - k;
		w->word = c & lead[k].data_mask;
		w->high_bits = (c >> lead[k].flag_shift) & lead[k].flag_mask;
		return lead[k].plain_bytes;
	}

	/* Not a start byte (for example a bare continuation byte). */
	return -1;
}
/* Number of sequence lengths (4 to 7 bytes) whose first continuation
 * byte carries high bits in addition to payload bits.
 */
#define SPECIAL_CONTINUE_BYTE_NUM (START_BYTE_NUM - 3)
/* Parse the first continuation byte of a 4- to 7-byte sequence.
 *
 * w->len must already hold the sequence length and w->high_bits the
 * flags taken from the start byte.  On success the flags found in `c`
 * are appended below the existing ones and w->word is set to the
 * payload bits of `c`.
 *
 * Returns 1 on success, 0 if the sequence length has no special byte or
 * `c` is not a continuation byte (10xxxxxx).
 */
static int parse_special_byte(unsigned char c, struct word *w)
{
	/* Index i = sequence length - 4.  For index i the byte holds
	 * (i + 1) high bits directly below the "10" marker, followed by
	 * (5 - i) payload bits.  Hence the flags already stored are
	 * shifted left by (i + 1) and the byte is shifted right by
	 * (5 - i) before the mask below is applied.
	 */
	static const unsigned char mask[SPECIAL_CONTINUE_BYTE_NUM] = {
		0x1, /* 11110HHH 10Hxxxxx */
		0x3, /* 111110HH 10HHxxxx */
		0x7, /* 1111110H 10HHHxxx */
		0xF  /* 11111110 10HHHHxx */
	};
	/* Payload bits left in the byte once the high bits are removed. */
	static const unsigned char wordmask[SPECIAL_CONTINUE_BYTE_NUM] = {
		0x1F, 0xF, 0x7, 0x3
	};
	int i = w->len - 4;

	/* Both tables are indexed with i, so guard both ends. */
	if (i < 0 || i >= SPECIAL_CONTINUE_BYTE_NUM)
		return 0;

	/* Like every continuation byte, this one must start with "10".
	 * The masks below discard the top two bits, so without this test
	 * a malformed byte would be accepted silently.
	 */
	if ((c >> 6) != 0x2)
		return 0;

	w->high_bits = (w->high_bits << (i + 1)) | ((c >> (5 - i)) & mask[i]);
	w->word = c & wordmask[i];
	return 1;
}
/* Decode one complete Pseudo-UTF-8 sequence from `r` into `w`.
 * Returns 1 on success and 0 on a short read or a malformed byte.
 */
static int decode_seq(struct creole_reader *r, struct word *w)
{
	int byte;
	int plain;

	w->high_bits = 0;

	byte = read(r);
	if (byte < 0)
		return 0;

	plain = parse_start_byte((unsigned char)(byte & 0xFF), w);
	if (plain < 0)
		return 0;

	/* A start byte is followed by (len - 1) bytes.  When fewer than
	 * that are plain, the first continuation byte carries high bits
	 * and has to be parsed on its own.
	 */
	if (w->len - plain >= 2) {
		byte = read(r);
		if (byte < 0
		    || !parse_special_byte((unsigned char)(byte & 0xFF), w))
			return 0;
	}

	return read_continue(r, w, plain);
}
/* Encode `i` together with the flags `high_bits` as a Pseudo-UTF-8
 * sequence of exactly `encode_to` bytes, written to
 * buf[0 .. encode_to-1].
 *
 * encode_to must be 1 to 7.  A 1-byte sequence holds values below 0x80
 * and has no room for flags, so high_bits is ignored for it.  Longer
 * sequences carry four flag bits.
 *
 * Returns 1 on success.  Returns 0, leaving buf untouched, if the
 * length is unsupported, `i` does not fit in the requested length, or
 * high_bits needs more than four bits.
 */
int creole_encode(creole_word i, unsigned encode_to, unsigned high_bits,
                  unsigned char buf[7])
{
	/* One row per multi-byte length, indexed by (encode_to - 2). */
	static const struct {
		/* Largest value that fits in the payload bits. */
		creole_word max;
		/* Length marker of the first byte. */
		unsigned char b1_mask;
		/* Right shift dropping the flags that live in byte 2. */
		int high_bit_shift_b1;
		/* Left shift (despite the name) placing the remaining
		 * flags inside byte 1. */
		int high_bit_shift_to_right_b1;
		/* Right shift leaving the payload bits kept in byte 1. */
		int data_shift_b1;
		/* Mask selecting the flags stored in byte 2. */
		int high_bit_mask_b2;
		/* Left shift placing those flags inside byte 2. */
		int high_bit_shift_b2;
		/* Payload bits available in byte 2. */
		unsigned char b2_data_mask;
	} d[] = {
		{0x7F,       0xC0, 0, 1, 6,  0x0, 0, 0x3F}, /* 2 */
		{0xFFF,      0xE0, 0, 0, 12, 0x0, 0, 0x3F}, /* 3 */
		{0x1FFFF,    0xF0, 1, 0, 17, 0x1, 5, 0x1F}, /* 4 */
		{0x3FFFFF,   0xF8, 2, 0, 22, 0x3, 4, 0x0F}, /* 5 */
		{0x7FFFFFF,  0xFC, 3, 0, 27, 0x7, 3, 0x07}, /* 6 */
		{0xFFFFFFFF, 0xFE, 4, 0, 32, 0xF, 2, 0x03}  /* 7 */
	};
	int lb;
	unsigned j;

	/* Only lengths 1..7 exist: buf has seven slots and d[] has one
	 * row for each of the lengths 2..7.  Anything else would index
	 * d[] (and, for 8, buf) out of bounds.
	 */
	if (encode_to < 1 || encode_to > 7)
		return 0;

	if (encode_to == 1) {
		if (i < 0x80) {
			buf[0] = (unsigned char)i;
			return 1;
		}
		return 0;
	}

	/* The format stores four flag bits; a wider value would spill
	 * into the length marker of the first byte.
	 */
	if (high_bits > 0xF)
		return 0;

	lb = (int)encode_to - 2;
	if (i > d[lb].max)
		return 0;

	buf[0] = (unsigned char)(d[lb].b1_mask
	         | (high_bits >> d[lb].high_bit_shift_b1
	            << d[lb].high_bit_shift_to_right_b1));

	/* Shifts greater than or equal to the bit size of a type are
	 * undefined.  Data in the first byte is always aligned with the
	 * LSB.
	 */
	if (d[lb].data_shift_b1 < (int)(sizeof(i) * CHAR_BIT))
		buf[0] |= (unsigned char)(i >> d[lb].data_shift_b1);

	buf[1] = (unsigned char)(0x80
	         | ((high_bits & d[lb].high_bit_mask_b2)
	            << d[lb].high_bit_shift_b2)
	         | ((i >> ((encode_to - 2) * 6))
	            & d[lb].b2_data_mask));

	/* Remaining bytes are plain continuation bytes, six bits each,
	 * most significant group first.
	 */
	for (j = 2; j < encode_to; j++) {
		buf[j] = (unsigned char)(0x80
		         | ((i >> ((encode_to - j - 1) * 6)) & 0x3F));
	}
	return 1;
}
/*************************************************************************
* Parsing instructions
*
* This parses an entire instruction, which is
* a single byte sequence,
* zero or more multibyte sequences,
* one single byte of all zeros.
*************************************************************************/
enum creole_compiler_ret
creole_parse_line(struct creole_ins *ins, struct creole_reader *r)
{
struct word w = {0};
unsigned arg = 0;
if (!decode_seq(r, &w))
return CREOLE_OPCODE_READ_ERROR;
ins->opcode = w.word;
if (w.word >= CREOLE_OPCODE_LEN || w.len != 1) {
return CREOLE_OPCODE_MALFORMED;
}
for (arg = 0; arg < opcode_info[ins->opcode].arglen; arg++) {
if (!decode_seq(r, &w))
return CREOLE_ARG_READ_ERROR;
if (w.len == 1)
return CREOLE_ARG_MALFORMED;
ins->w[arg] = w.word;
ins->w_flags[arg] = w.high_bits;
}
if (!decode_seq(r, &w))
return CREOLE_LAST_READ_ERROR;
if (w.word != 0 || w.len != 1)
return CREOLE_LAST_MALFORMED;
return CREOLE_COMPILE_OK;
}
/**************************************************************************
* High level compiling interface
*************************************************************************/
/* Return nonzero if `reg` is a usable index into env->reg.
 *
 * `reg` is signed while the callers pass unsigned words, so an oversized
 * value can arrive here as a negative number.  Reject it explicitly
 * rather than depending on how the comparison with env->reglen converts
 * its operands.
 */
static int valid_register(struct creole_env *env, int reg)
{
	return reg >= 0 && reg < env->reglen;
}
/* Return nonzero if `reg` is a usable index into the label table
 * (env->lab).
 *
 * The index is signed while the callers pass unsigned words, so an
 * oversized value can arrive here as a negative number.  Reject it
 * explicitly rather than depending on how the comparison with
 * env->lablen converts its operands.
 */
static int valid_label(struct creole_env *env, int reg)
{
	return reg >= 0 && reg < env->lablen;
}
/* Check one argument against the kind its slot requires.
 *
 * `val` is the argument word, `fl` says whether it is a register or an
 * immediate, and `typ` is the kind the opcode accepts in this slot.
 * Register-only slots also require a valid register number and label
 * slots a valid label number.  Returns nonzero if the argument is
 * acceptable; every other kind (including TYPE_NONE) is rejected.
 */
static int typecheck(struct creole_env *env, int val,
                     enum creole_word_flag fl, enum creole_arg_type typ)
{
	const int is_imm = (fl == CREOLE_IMMEDIATE);
	const int is_reg = (fl == CREOLE_REGISTER);

	if (typ == TYPE_IMM)
		return is_imm;
	if (typ == TYPE_REG)
		return is_reg && valid_register(env, val);
	if (typ == TYPE_VAL)
		return is_imm || is_reg;
	if (typ == TYPE_LAB)
		return is_imm && valid_label(env, val);
	return 0;
}
/* Bit 0 of an argument's high bits tells registers from immediates:
 * set means register, clear means immediate.
 */
static enum creole_word_flag get_type_from_high_bit(unsigned high_bits)
{
	return (high_bits & 1) ? CREOLE_REGISTER : CREOLE_IMMEDIATE;
}
static enum creole_compiler_ret typecheck_ins(struct creole_env *env,
struct creole_ins *ins)
{
unsigned i;
for (i = 0; i < opcode_info[ins->opcode].arglen; i++) {
if (!typecheck(env, ins->w[i],
get_type_from_high_bit(ins->w_flags[i]),
opcode_info[ins->opcode].argtype[i]))
return CREOLE_TYPE_ERROR;
}
return CREOLE_COMPILE_OK;
}
static void clear_ins(struct creole_ins *i)
{
const struct creole_ins blank = {0};
*i = blank;
}
/****
 * Resolve instructions that have an effect only at compile time.
 *
 * NOOP is dropped.  CLB records the current program position as the
 * target of the given label and is then dropped.  Both report
 * CREOLE_COMPILE_CLEARED_INSTRUCTION so the caller reuses the slot for
 * the next instruction.  Every other opcode is type-checked and the
 * result of that check is returned.
 ***/
static enum creole_compiler_ret
handle_compiletime_immediate(struct creole_env *env,
                             struct creole_ins *cur_ins)
{
	if (cur_ins->opcode == CREOLE_NOOP) {
		clear_ins(cur_ins);
		return CREOLE_COMPILE_CLEARED_INSTRUCTION;
	}

	if (cur_ins->opcode == CREOLE_CLB) {
		if (cur_ins->w[0] >= env->lablen)
			return CREOLE_LABEL_OVERFLOW;
		env->lab[cur_ins->w[0]] = env->prgptr;
		/* The label exists only at compile time: wipe the slot so
		 * the next instruction takes its place.
		 */
		clear_ins(cur_ins);
		return CREOLE_COMPILE_CLEARED_INSTRUCTION;
	}

	return typecheck_ins(env, cur_ins);
}
/* TODO: The compile step can be completely removed in favor of directly
* executing the bytecode, disassembling it with creole_parse_line()
* at every instance. This will also make the implementation simpler.
*/
/* Compile the bytecode in `r` into env->prg.
 *
 * Instructions are parsed one at a time until the reader is empty or
 * the program buffer is full.  Compile-time instructions do not take up
 * a slot.  On success env->prgend is the number of stored instructions
 * and env->prgptr is reset to 0.
 *
 * Returns CREOLE_COMPILE_OK, the first parse/type error encountered, or
 * CREOLE_PROGRAM_OVERFLOW if input remains once the buffer is full.
 *
 * NOTE(review): the write cursor starts at env->prg regardless of
 * env->prgptr; presumably callers pass an environment with
 * prgptr == 0 -- confirm.
 */
enum creole_compiler_ret
creole_compile(struct creole_env *env, struct creole_reader *r)
{
	struct creole_ins *slot = env->prg;
	int status;

	while (env->prgptr < env->prglen) {
		status = creole_parse_line(slot, r);
		if (status != CREOLE_COMPILE_OK)
			return status;

		status = handle_compiletime_immediate(env, slot);
		if (status == CREOLE_COMPILE_OK) {
			/* Keep the instruction: move on to the next slot. */
			slot++;
			env->prgptr++;
		} else if (status != CREOLE_COMPILE_CLEARED_INSTRUCTION) {
			return status;
		}

		if (read_eof(r))
			break;
	}

	if (!read_eof(r) && env->prgptr == env->prglen)
		return CREOLE_PROGRAM_OVERFLOW;

	env->prgend = env->prgptr;
	env->prgptr = 0;
	return CREOLE_COMPILE_OK;
}
/* Store `w` in register `reg`.
 * Returns CREOLE_REGISTER_OVERFLOW if `reg` is not a valid register,
 * CREOLE_STEP_CONTINUE otherwise.
 */
enum creole_run_ret creole_reg_write(struct creole_env *env, unsigned reg,
                                     creole_word w)
{
	if (valid_register(env, reg)) {
		env->reg[reg] = w;
		return CREOLE_STEP_CONTINUE;
	}
	return CREOLE_REGISTER_OVERFLOW;
}
/* Copy the contents of register `reg` into *w.
 * Returns CREOLE_REGISTER_OVERFLOW (leaving *w untouched) if `reg` is
 * not a valid register, CREOLE_STEP_CONTINUE otherwise.
 */
enum creole_run_ret creole_reg_read(struct creole_env *env, unsigned reg,
                                    creole_word *w)
{
	if (valid_register(env, reg)) {
		*w = env->reg[reg];
		return CREOLE_STEP_CONTINUE;
	}
	return CREOLE_REGISTER_OVERFLOW;
}
/* Resolve argument `arg` of `ins` to a value: an immediate is taken as
 * is, a register argument is read from the register file.
 * Returns the status of the register read, or CREOLE_STEP_CONTINUE for
 * an immediate.
 */
static enum creole_run_ret read_val(struct creole_env *env,
                                    struct creole_ins *ins,
                                    unsigned arg,
                                    creole_word *w)
{
	if (get_type_from_high_bit(ins->w_flags[arg]) != CREOLE_REGISTER) {
		*w = ins->w[arg];
		return CREOLE_STEP_CONTINUE;
	}
	return creole_reg_read(env, ins->w[arg], w);
}
/* Push `w` onto the stack.
 * Returns CREOLE_STACK_OVERFLOW if the stack pointer has reached
 * env->stklen, CREOLE_STEP_CONTINUE otherwise.
 */
enum creole_run_ret creole_push(struct creole_env *env, creole_word w)
{
	if (env->stkptr == env->stklen)
		return CREOLE_STACK_OVERFLOW;

	env->stk[env->stkptr] = w;
	env->stkptr++;
	return CREOLE_STEP_CONTINUE;
}
/* Pop the top of the stack into *w.
 * Returns CREOLE_STACK_UNDERFLOW (leaving *w untouched) if the stack is
 * empty, CREOLE_STEP_CONTINUE otherwise.
 */
enum creole_run_ret creole_pop(struct creole_env *env, creole_word *w)
{
	if (env->stkptr == 0)
		return CREOLE_STACK_UNDERFLOW;

	env->stkptr--;
	*w = env->stk[env->stkptr];
	return CREOLE_STEP_CONTINUE;
}
/* Verify at run time that `label` indexes the label table.
 * Returns CREOLE_STEP_CONTINUE if it does, CREOLE_RUN_LABEL_OVERFLOW
 * if it does not.
 */
static enum creole_run_ret
check_label(struct creole_env *env, creole_word label)
{
	if (label < env->lablen)
		return CREOLE_STEP_CONTINUE;
	return CREOLE_RUN_LABEL_OVERFLOW;
}
/* Which operands of a two-operand instruction are to be treated as
 * signed.  The value is a two-bit mask: bit 1 stands for the first
 * operand, bit 0 for the second.
 */
enum argument_signed {
	ALL_UNSIGNED  = 0, /* 0b00 */
	SECOND_SIGNED = 1, /* 0b01 */
	FIRST_SIGNED  = 2, /* 0b10 */
	ALL_SIGNED    = 3  /* 0b11 */
};
/* Build the signedness mask from the flags of the two operands.  Bit 1
 * (0x2) of each flag word is that operand's "signed" marker; all other
 * bits are ignored.
 */
static enum argument_signed check_sign_bits(unsigned flags1, unsigned flags2)
{
	const unsigned first = flags1 & 0x2;
	const unsigned second = (flags2 >> 1) & 0x1;

	return first | second;
}
/* Evaluate `fun` (an expression yielding a run status), store the result
 * in the local variable `rcode` of the enclosing function, and return it
 * from that function unless it equals CREOLE_STEP_CONTINUE.
 * The enclosing function must declare `rcode`.
 */
#define check(fun) do { \
rcode = fun; \
if (rcode != CREOLE_STEP_CONTINUE) \
return rcode; \
} while(0)
/* Apply the binary operator OPER to the locals `a1` and `a2` of the
 * enclosing function and store the result back into `a1`.
 *
 * The signedness flags of arguments 1 and 2 of the local `ins` select
 * which operands are cast to creole_signed before the operation.  The
 * default branch returns CREOLE_STEP_HIGH_BIT_MALFORMED from the
 * enclosing function.
 *
 * NOTE(review): in the FIRST_SIGNED and SECOND_SIGNED cases only one
 * operand is cast.  If creole_word is the unsigned counterpart of
 * creole_signed, C's usual arithmetic conversions turn the cast operand
 * back into an unsigned value, so these cases would compute the same
 * result as ALL_UNSIGNED -- confirm against the typedefs in creole.h.
 */
#define chk_sign_op(OPER) do { \
switch (check_sign_bits(ins->w_flags[1], ins->w_flags[2])) { \
case ALL_UNSIGNED: \
a1 = a1 OPER a2; \
break; \
case FIRST_SIGNED: \
a1 = (creole_signed)a1 OPER a2; \
break; \
case SECOND_SIGNED: \
a1 = a1 OPER (creole_signed)a2; \
break; \
case ALL_SIGNED: \
a1 = (creole_signed) a1 OPER (creole_signed) a2; \
break; \
default: \
return CREOLE_STEP_HIGH_BIT_MALFORMED; \
} \
} while(0)
/* Execute the instruction at env->prgptr.
 *
 * Returns
 *  - CREOLE_STEP_STOP when the program counter has reached env->prgend;
 *  - CREOLE_STEP_SYSCALL after a SYS instruction, with the requested
 *    value stored in *sc;
 *  - CREOLE_STEP_UNKNOWN_OPCODE for an opcode that has no handler here
 *    (the program counter still advances);
 *  - CREOLE_DIV_BY_ZERO, or the status of the first failing operand
 *    access, in which case the program counter is left unchanged;
 *  - CREOLE_STEP_CONTINUE otherwise.
 *
 * The check() and chk_sign_op() macros used below operate on the locals
 * rcode, ins, a1 and a2 and may return from this function.
 */
enum creole_run_ret creole_step(struct creole_env *env, creole_word *sc)
{
	struct creole_ins *ins = env->prg + env->prgptr;
	creole_word a1, a2;
	int rcode = CREOLE_STEP_CONTINUE;
	/* Set by the jump instructions when their condition holds. */
	int jump = 0;

	if (env->prgptr == env->prgend)
		return CREOLE_STEP_STOP;

	switch (ins->opcode) {
	/* Stack */
	case CREOLE_PUSH:
		check(read_val(env, ins, 0, &a1));
		check(creole_push(env, a1));
		break;
	case CREOLE_POP:
		check(creole_pop(env, &a1));
		check(creole_reg_write(env, ins->w[0], a1));
		break;

	/* Arithmetic: destination register, then two operands */
	case CREOLE_ADD:
		check(read_val(env, ins, 1, &a1));
		check(read_val(env, ins, 2, &a2));
		check(creole_reg_write(env, ins->w[0], a1 + a2));
		break;
	case CREOLE_MUL:
		check(read_val(env, ins, 1, &a1));
		check(read_val(env, ins, 2, &a2));
		check(creole_reg_write(env, ins->w[0], a1 * a2));
		break;
	case CREOLE_DIV:
		check(read_val(env, ins, 1, &a1));
		check(read_val(env, ins, 2, &a2));
		if (a2 == 0)
			return CREOLE_DIV_BY_ZERO;
		chk_sign_op(/);
		check(creole_reg_write(env, ins->w[0], a1));
		break;

	/* System call: hand the value to the caller */
	case CREOLE_SYS:
		check(read_val(env, ins, 0, sc));
		rcode = CREOLE_STEP_SYSCALL;
		break;

	/* Conditional jumps: label, then two operands */
	case CREOLE_JL:
		check(read_val(env, ins, 1, &a1));
		check(read_val(env, ins, 2, &a2));
		check(check_label(env, ins->w[0]));
		chk_sign_op(<);
		jump = (a1 != 0);
		break;
	case CREOLE_JLE:
		check(read_val(env, ins, 1, &a1));
		check(read_val(env, ins, 2, &a2));
		check(check_label(env, ins->w[0]));
		chk_sign_op(<=);
		jump = (a1 != 0);
		break;
	case CREOLE_JE:
		check(read_val(env, ins, 1, &a1));
		check(read_val(env, ins, 2, &a2));
		check(check_label(env, ins->w[0]));
		jump = (a1 == a2);
		break;
	case CREOLE_JNE:
		check(read_val(env, ins, 1, &a1));
		check(read_val(env, ins, 2, &a2));
		check(check_label(env, ins->w[0]));
		jump = (a1 != a2);
		break;

	default:
		rcode = CREOLE_STEP_UNKNOWN_OPCODE;
		break;
	}

	/* A taken jump loads the label's position; everything else falls
	 * through to the next instruction.
	 */
	if (jump)
		env->prgptr = env->lab[ins->w[0]];
	else
		env->prgptr++;
	return rcode;
}
#undef check