/* Copyright (c) 2023 Peter McGoron

Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.

THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE
FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY
DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/

#include "creole.h"

#include <limits.h> /* CHAR_BIT */
#include <stddef.h> /* NULL */

/*************************************************************************
 * Static information
 ************************************************************************/

/* Each argument slot of an opcode accepts one of the following:
 *  TYPE_NONE: the slot is unused (do not give an argument)
 *  TYPE_IMM:  immediate values only (jump targets, data index of db)
 *  TYPE_REG:  register values only (pop, destination of math operations)
 *  TYPE_VAL:  either an immediate or a register (push, sys, operands)
 */
enum creole_arg_type {
	TYPE_NONE,
	TYPE_IMM,
	TYPE_REG,
	TYPE_VAL,
	CREOLE_ARG_TYPE_LEN
};

/* C99+ allows for designating the array index when initializing arrays:
 *   [i] = v,
 * in C89 indices are implicit from 0 to the maximum filled-in value.
 *
 * The first argument of defop only names the row for the reader; the
 * macro discards it. The table is indexed by opcode value, so the rows
 * must stay in opcode order.
 */
#define defop(s, n, a1, a2, a3) {n, {a1, a2, a3}}
static const struct {
	/* Number of arguments the opcode takes. */
	int arglen;
	/* Accepted kind of each argument slot. */
	enum creole_arg_type argtype[CREOLE_MAX_ARG];
} opcode_info[CREOLE_OPCODE_LEN] = {
	defop(NOOP, 0, TYPE_NONE, TYPE_NONE, TYPE_NONE),
	defop(PUSH, 1, TYPE_VAL, TYPE_NONE, TYPE_NONE),
	defop(POP, 1, TYPE_REG, TYPE_NONE, TYPE_NONE),
	defop(ADD, 3, TYPE_REG, TYPE_VAL, TYPE_VAL),
	defop(MUL, 3, TYPE_REG, TYPE_VAL, TYPE_VAL),
	defop(DIV, 3, TYPE_REG, TYPE_VAL, TYPE_VAL),
	defop(SYS, 1, TYPE_VAL, TYPE_NONE, TYPE_NONE),
	defop(JL, 3, TYPE_IMM, TYPE_VAL, TYPE_VAL),
	defop(JLE, 3, TYPE_IMM, TYPE_VAL, TYPE_VAL),
	defop(JE, 3, TYPE_IMM, TYPE_VAL, TYPE_VAL),
	defop(JNE, 3, TYPE_IMM, TYPE_VAL, TYPE_VAL),
	defop(DB, 1, TYPE_IMM, TYPE_NONE, TYPE_NONE)
};

/*************************************************************************
 * Reading from the buffer
 ************************************************************************/

/* Consume one byte from the reader.
 * Returns the byte, or -1 when no bytes are left.
 */
static int read(struct creole_reader *r)
{
	if (r->left == 0)
		return -1;
	r->left--;
	return *r->p++;
}

/* Returns nonzero when the reader has no bytes left. */
static int read_eof(struct creole_reader *r)
{
	return r->left == 0;
}

/*************************************************************************
 * Pseudo-UTF-8 lexing
 *
 * Pseudo-UTF-8 is based off of UTF-8 but adds more
 * bytes and allows (requires!) overlong encodings.
 *
 * Possible values:
 * 0xxxxxxx                                                      (7 bits)
 * 110HHHHx 10xxxxxx                                             (11 bits)
 * 1110HHHH 10xxxxxx 10xxxxxx                                    (16 bits)
 * 11110HHH 10Hxxxxx 10xxxxxx 10xxxxxx                           (21 bits)
 * 111110HH 10HHxxxx 10xxxxxx 10xxxxxx 10xxxxxx                  (26 bits)
 * 1111110H 10HHHxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx         (31 bits)
 * 11111110 10HHHHxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (36 bits)
 *          10xxxxxx
 ************************************************************************/

/* A Pseudo-UTF-8 sequence can be either
 *
 * * A 1 byte sequence, where the lower 7 bits are the encoded
 * * word (no high bits).
 * * A multi-byte sequence where the 4 MSB are flags, and the
 * * lower bits are the encoded word.
 */
#define MAX_HIGH_BITS 15

/* Decode a set of continuation bytes directly into the word. This assumes
 * that each continuation byte contains no high words.
 *
 * Returns 1 on success, 0 if the input ran out or a byte did not have
 * the 10xxxxxx continuation prefix.
 */
static int read_continue(struct creole_reader *r, struct creole_word *w,
                         int to_read)
{
	int i;
	int r_ret;
	unsigned char c;

	for (i = 0; i < to_read; i++) {
		r_ret = read(r);
		if (r_ret < 0) {
			return 0;
		}
		/* Characters might not be 8 bits! */
		c = (unsigned char)(r_ret & 0xFF);
		if (c >> 6 != 0x2) {
			return 0;
		}
		w->word = (w->word << 6) | (c & 0x3F);
	}

	return 1;
}

/* Start bytes must be treated differently. Depending on the scenario,
 * start bytes will contain parts of the encoded word and high-bit flags.
 * In some cases, not all of the high-bit flags are part of the start
 * byte.
 *
 * On success this fills in w->len, w->word and w->high_bits from the
 * start byte and returns the number of normal continuation bytes that
 * follow. Returns -1 if c is not a valid start byte.
 */
#define START_BYTE_NUM 7
static int parse_start_byte(unsigned char c, struct creole_word *w)
{
	static const struct {
		/* The algorithm compares the mask to the start byte
		 * by shifting both to the right by the amount of 'x's
		 * (blank spaces). The array is arranged in reverse
		 * order so that the index indicates the amount of
		 * bits to shift.
		 */
		unsigned char mask;

		/* The word bits, if they exist, always start from the
		 * LSB, so there is no need to shift the bits away. The
		 * word_mask gets the low bits. If there are no bits, set
		 * to 0.
		 */
		unsigned char word_mask;

		/* The high bits may not start from the LSB. There needs
		 * to be a shift to get the bits to the LSB, and a mask
		 * to discard the higher bits.
		 */
		unsigned char high_bit_mask;
		int high_bit_shift;

		/* The amount of NORMAL continuation bytes to read.
		 * This does NOT include continuation bytes that have
		 * high-bit flags in them.
		 */
		int to_read;
	} start_data[START_BYTE_NUM-1] = {
		{0xFE, 0x00, 0x0, 0, 5}, /* 11111110 */
		{0xFC, 0x00, 0x1, 0, 4}, /* 1111110x */
		{0xF8, 0x00, 0x3, 0, 3}, /* 111110xx */
		{0xF0, 0x00, 0x7, 0, 2}, /* 11110xxx */
		{0xE0, 0x00, 0xF, 0, 2}, /* 1110xxxx */
		{0xC0, 0x01, 0xF, 1, 1}  /* 110xxxxx */
	};
	int i;

	for (i = 0; i < START_BYTE_NUM-1; i++) {
		if (c >> i == start_data[i].mask >> i) {
			w->len = START_BYTE_NUM - i;
			w->word = c & start_data[i].word_mask;
			w->high_bits = (c >> start_data[i].high_bit_shift)
			               & start_data[i].high_bit_mask;
			return start_data[i].to_read;
		}
	}

	/* i == 7 */
	if (c >> 7 == 0) {
		w->len = 1;
		w->word = c;
		w->high_bits = 0;
		return 0;
	}

	return -1;
}

/* This parses the first continuation byte if it is special, i.e. if it
 * carries the high-bit flags that did not fit in the start byte.
 *
 * Returns 1 on success. Returns 0 if w->len does not name a sequence
 * with a special byte, or if c lacks the 10xxxxxx continuation prefix.
 */
#define SPECIAL_CONTINUE_BYTE_NUM (START_BYTE_NUM - 3)
static int parse_special_byte(unsigned char c, struct creole_word *w)
{
	/* The index denotes the amount of high bits that were in
	 * the start byte. This is the amount that the stored value
	 * must be shifted.
	 *
	 * The amount of bits that must be shifted out in the continue
	 * byte increase with the index. The amount shifted is (i + 2).
	 *
	 * Each value stored in the array is the mask applied after
	 * shifting the continue byte bits.
	 */
	static const unsigned char mask[SPECIAL_CONTINUE_BYTE_NUM] = {
		0x1, /* 11110HHH 10Hxxxxx */
		0x3, /* 111110HH 10HHxxxx */
		0x7, /* 1111110H 10HHHxxx */
		0xF  /* 11111110 10HHHHxx */
	};
	static const unsigned char wordmask[SPECIAL_CONTINUE_BYTE_NUM] = {
		0x1F, 0xF, 0x7, 0x3
	};
	int i = (int)w->len - 4;

	/* Guard both ends: the index is used on two fixed-size tables. */
	if (i < 0 || i >= SPECIAL_CONTINUE_BYTE_NUM)
		return 0;

	/* The special byte is still a continuation byte, so it must carry
	 * the 10 prefix just like the bytes checked in read_continue().
	 * Without this check malformed input would be silently accepted.
	 */
	if (c >> 6 != 0x2)
		return 0;

	w->high_bits = (w->high_bits << (i + 1)) | ((c >> (5 - i)) & mask[i]);
	w->word = c & wordmask[i];
	return 1;
}

/* Parse an entire Pseudo-UTF8 sequence from r into w.
 * Returns 1 on success and 0 on end of input or a malformed sequence.
 */
int creole_decode(struct creole_reader *r, struct creole_word *w)
{
	int r_ret;
	int to_read;

	w->high_bits = 0;

	r_ret = read(r);
	if (r_ret < 0)
		return 0;

	to_read = parse_start_byte((unsigned char)(r_ret & 0xFF), w);
	if (to_read < 0)
		return 0;

	/* If to_read is not one less than w->len, that means there are
	 * high bits in the first continuation byte.
	 */
	if (w->len - to_read > 1) {
		r_ret = read(r);
		if (r_ret < 0)
			return 0;
		if (!parse_special_byte((unsigned char)(r_ret & 0xFF), w))
			return 0;
	}

	return read_continue(r, w, to_read);
}

/* Encode the word i as a Pseudo-UTF-8 sequence of exactly encode_to
 * bytes, with the flag bits high_bits, into buf.
 *
 * encode_to must be between 1 and 7 (buf holds 7 bytes). A 1 byte
 * sequence carries no flags, so high_bits is ignored for encode_to == 1.
 *
 * Returns 1 on success. Returns 0 if encode_to is out of range, if i
 * does not fit in the requested length, or if high_bits does not fit in
 * the 4 flag bits.
 */
int creole_encode(creole_word i, unsigned encode_to, unsigned high_bits,
                  unsigned char buf[7])
{
	static const struct {
		/* Largest word that fits in this sequence length. */
		creole_word max;
		/* Length prefix of the start byte. */
		unsigned char b1_mask;
		/* Drops the flag bits that live in the second byte. */
		int high_bit_shift_b1;
		/* Moves the flag bits past the data bit of the start byte. */
		int high_bit_shift_to_right_b1;
		/* Position of the word bits stored in the start byte. */
		int data_shift_b1;
		/* Flag bits stored in the second byte, and where they go. */
		int high_bit_mask_b2;
		int high_bit_shift_b2;
		/* Word bits stored in the second byte. */
		unsigned char b2_data_mask;
	} d[] = {
		{0x7F,       0xC0, 0, 1, 6,  0x0, 0, 0x3F}, /* 2 */
		{0xFFF,      0xE0, 0, 0, 12, 0x0, 0, 0x3F}, /* 3 */
		{0x1FFFF,    0xF0, 1, 0, 17, 0x1, 5, 0x1F}, /* 4 */
		{0x3FFFFF,   0xF8, 2, 0, 22, 0x3, 4, 0x0F}, /* 5 */
		{0x7FFFFFF,  0xFC, 3, 0, 27, 0x7, 3, 0x07}, /* 6 */
		{0xFFFFFFFF, 0xFE, 4, 0, 32, 0xF, 2, 0x03}  /* 7 */
	};
	int lb;
	unsigned j;

	/* The table covers lengths 2..7 and buf holds 7 bytes. Length 0
	 * would index the table at -2 and length 8 would run off its end
	 * (and off the end of buf), so both must be rejected here.
	 */
	if (encode_to < 1 || encode_to > 7)
		return 0;

	if (encode_to == 1) {
		if (i < 0x80) {
			buf[0] = i;
			return 1;
		}
		return 0;
	}

	/* More than 4 flag bits would spill into the length prefix of the
	 * start byte and produce a different (corrupt) sequence.
	 */
	if (high_bits > MAX_HIGH_BITS)
		return 0;

	lb = (int)encode_to - 2;
	if (i > d[lb].max) {
		return 0;
	}

	buf[0] = (d[lb].b1_mask
	          | (high_bits >> d[lb].high_bit_shift_b1
	             << d[lb].high_bit_shift_to_right_b1));

	/* shifts greater than or equal to the bit size of a type are
	 * undefined. Data in the first byte is always aligned with the LSB.
	 */
	if ((size_t)d[lb].data_shift_b1 < sizeof(i) * CHAR_BIT)
		buf[0] |= i >> d[lb].data_shift_b1;

	buf[1] = 0x80
	         | ((high_bits & d[lb].high_bit_mask_b2)
	            << d[lb].high_bit_shift_b2)
	         | ((i >> ((encode_to - 2) * 6)) & d[lb].b2_data_mask);

	/* Remaining bytes are plain continuation bytes, 6 bits each,
	 * most significant group first.
	 */
	for (j = 2; j < encode_to; j++) {
		buf[j] = 0x80 | ((i >> ((encode_to - j - 1) * 6)) & 0x3F);
	}

	return 1;
}

/*************************************************************************
 * Parsing instructions
 *
 * This parses an entire instruction, which is
 *   a single byte sequence,
 *   zero or more multibyte sequences,
 *   one single byte of all zeros.
 *************************************************************************/

struct ins {
	/* Position in the reader where the instruction begins. */
	unsigned char *start;
	/* For db: position just after the argument, where the data
	 * begins. NULL for every other opcode.
	 */
	unsigned char *datapt;
	enum creole_opcode opcode;
	/* Decoded argument words and their high-bit flags. */
	creole_word w[CREOLE_MAX_ARG];
	creole_word w_flags[CREOLE_MAX_ARG];
};

/* Returns nonzero if reg is a valid index into the register file.
 * Callers pass unsigned values through this int parameter; a value
 * that converted to a negative int is never a valid register.
 */
static int valid_register(struct creole_env *env, int reg)
{
	return reg >= 0 && reg < env->reglen;
}

/* Returns nonzero if an argument with value val and kind fl is
 * acceptable for an argument slot of type typ.
 */
static int typecheck_arg(struct creole_env *env, int val,
                         enum creole_word_flag fl,
                         enum creole_arg_type typ)
{
	switch (typ) {
	case TYPE_IMM:
		return fl == CREOLE_IMMEDIATE;
	case TYPE_REG:
		return fl == CREOLE_REGISTER && valid_register(env, val);
	case TYPE_VAL:
		return fl == CREOLE_IMMEDIATE || fl == CREOLE_REGISTER;
	default:
		return 0;
	}
}

/* The lowest high bit selects between a register and an immediate. */
static enum creole_word_flag arg_get_type(unsigned high_bits)
{
	if (high_bits & 1) {
		return CREOLE_REGISTER;
	} else {
		return CREOLE_IMMEDIATE;
	}
}

/* Decode and type-check one instruction from r into ins.
 *
 * Returns CREOLE_COMPILE_OK on success. Otherwise returns the error
 * for the part that failed: the opcode, an argument, an argument type,
 * or the terminating zero byte.
 */
static enum creole_compiler_ret
parse_line(struct creole_env *env, struct ins *ins, struct creole_reader *r)
{
	struct creole_word w = {0};
	int i;

	ins->start = r->p;
	if (!creole_decode(r, &w))
		return CREOLE_OPCODE_READ_ERROR;

	/* Validate before storing so that the enum field (and the table
	 * index below) only ever holds an in-range opcode.
	 */
	if (w.word >= CREOLE_OPCODE_LEN || w.len != 1) {
		return CREOLE_OPCODE_MALFORMED;
	}
	ins->opcode = w.word;

	if (opcode_info[ins->opcode].arglen > CREOLE_MAX_ARG)
		return CREOLE_OPCODE_MALFORMED;

	for (i = 0; i < opcode_info[ins->opcode].arglen; i++) {
		if (!creole_decode(r, &w))
			return CREOLE_ARG_READ_ERROR;
		/* Arguments are always multibyte sequences. */
		if (w.len == 1)
			return CREOLE_ARG_MALFORMED;
		ins->w[i] = w.word;
		ins->w_flags[i] = w.high_bits;

		if (!typecheck_arg(env, ins->w[i],
		                   arg_get_type(ins->w_flags[i]),
		                   opcode_info[ins->opcode].argtype[i]))
			return CREOLE_TYPE_ERROR;
	}

	if (ins->opcode == CREOLE_DB) {
		ins->datapt = r->p;

		/* The data runs until the first single byte sequence,
		 * which must be the zero terminator.
		 */
		do {
			if (!creole_decode(r, &w))
				return CREOLE_ARG_READ_ERROR;
		} while (w.len != 1);
		if (w.word != 0)
			return CREOLE_LAST_READ_ERROR;
		return CREOLE_COMPILE_OK;
	}

	ins->datapt = NULL;
	if (!creole_decode(r, &w))
		return CREOLE_LAST_READ_ERROR;
	if (w.word != 0 || w.len != 1)
		return CREOLE_LAST_MALFORMED;
	return CREOLE_COMPILE_OK;
}

/**************************************************************************
 * High level compiling interface
 *************************************************************************/

/* Record compile-time information from one parsed instruction.
 * Only db has any: its data pointer is stored in the data table.
 */
static void add_to_env(struct creole_env *env, struct ins *ins)
{
	switch (ins->opcode) {
	case CREOLE_DB:
		/* NOTE(review): ins->w[0] comes straight from the
		 * bytecode and is not bounds-checked against the size
		 * of env->dats here -- confirm the caller guarantees
		 * the table is large enough, or add a check.
		 */
		env->dats[ins->w[0]] = ins->datapt;
		break;
	default:
		;
	}
}

/* Parse the whole program in env->r_start once, checking every
 * instruction and recording db entries. The current position is reset
 * to the start of the program on success.
 *
 * Returns CREOLE_COMPILE_OK, or the error of the first bad instruction.
 */
enum creole_compiler_ret creole_compile(struct creole_env *env)
{
	struct ins ins = {0};
	int rcode;

	env->r_current = env->r_start;

	while (!read_eof(&env->r_current)) {
		rcode = parse_line(env, &ins, &env->r_current);
		if (rcode != CREOLE_COMPILE_OK)
			return rcode;
		add_to_env(env, &ins);
	}

	env->r_current = env->r_start;
	return CREOLE_COMPILE_OK;
}

/**************************************************************************
 * Running and interaction interface
 *************************************************************************/

/* Store w in register reg.
 * Returns CREOLE_REGISTER_OVERFLOW if reg is not a valid register.
 */
enum creole_run_ret creole_reg_write(struct creole_env *env, unsigned reg,
                                     creole_word w)
{
	if (!valid_register(env, reg)) {
		return CREOLE_REGISTER_OVERFLOW;
	}

	env->reg[reg] = w;
	return CREOLE_STEP_CONTINUE;
}

/* Load register reg into *w.
 * Returns CREOLE_REGISTER_OVERFLOW if reg is not a valid register.
 */
enum creole_run_ret creole_reg_read(struct creole_env *env, unsigned reg,
                                    creole_word *w)
{
	if (!valid_register(env, reg))
		return CREOLE_REGISTER_OVERFLOW;

	*w = env->reg[reg];
	return CREOLE_STEP_CONTINUE;
}

/* Fetch the value of argument number arg of ins into *w: the register
 * contents for a register argument, the word itself for an immediate.
 */
static enum creole_run_ret read_val(struct creole_env *env,
                                    struct ins *ins,
                                    unsigned arg,
                                    creole_word *w)
{
	if (arg_get_type(ins->w_flags[arg]) == CREOLE_REGISTER) {
		return creole_reg_read(env, ins->w[arg], w);
	} else {
		*w = ins->w[arg];
	}

	return CREOLE_STEP_CONTINUE;
}

/* Push w. Returns CREOLE_STACK_OVERFLOW if the stack is full. */
enum creole_run_ret creole_push(struct creole_env *env, creole_word w)
{
	if (env->stkptr == env->stklen)
		return CREOLE_STACK_OVERFLOW;
	env->stk[env->stkptr++] = w;
	return CREOLE_STEP_CONTINUE;
}

/* Pop into *w. Returns CREOLE_STACK_UNDERFLOW if the stack is empty. */
enum creole_run_ret creole_pop(struct creole_env *env, creole_word *w)
{
	if (env->stkptr == 0)
		return CREOLE_STACK_UNDERFLOW;
	*w = env->stk[--env->stkptr];
	return CREOLE_STEP_CONTINUE;
}

/* Which of the two operands are to be treated as signed. */
enum argument_signed {
	ALL_UNSIGNED = 0, /* 0b00 */
	FIRST_SIGNED = 2, /* 0b10 */
	SECOND_SIGNED = 1, /* 0b01 */
	ALL_SIGNED = 3 /* 0b11 */
};

/* Combine the sign flag (bit 1 of the high bits) of two operands. */
static enum argument_signed check_sign_bits(unsigned flags1, unsigned flags2)
{
	return (flags1 & 0x2) | ((flags2 & 0x2) >> 1);
}

/* Evaluate fun and return its result from the enclosing function unless
 * it is CREOLE_STEP_CONTINUE. Requires a local named rcode.
 */
#define check(fun) do { \
	rcode = fun; \
	if (rcode != CREOLE_STEP_CONTINUE) \
		return rcode; \
} while(0)

/* Move the current position to byte offset off from the start of the
 * program. Returns 1 on success, 0 if off is past the end.
 */
int creole_jump(struct creole_env *env, creole_word off)
{
	/* When env->r_start.left == off, this is the end of the program. */
	if (env->r_start.left < off)
		return 0;

	env->r_current.p = env->r_start.p + off;
	env->r_current.left = env->r_start.left - off;
	return 1;
}

/* Compute a1 = a1 OPER a2, casting each operand to creole_signed when
 * its sign flag is set. Requires locals ins, a1 and a2.
 */
#define chk_sign_op(OPER) do { \
	switch (check_sign_bits(ins.w_flags[1], ins.w_flags[2])) { \
	case ALL_UNSIGNED: \
		a1 = a1 OPER a2; \
		break; \
	case FIRST_SIGNED: \
		a1 = (creole_signed)a1 OPER a2; \
		break; \
	case SECOND_SIGNED: \
		a1 = a1 OPER (creole_signed)a2; \
		break; \
	case ALL_SIGNED: \
		a1 = (creole_signed) a1 OPER (creole_signed) a2; \
		break; \
	default: \
		return CREOLE_STEP_HIGH_BIT_MALFORMED; \
	} \
} while(0)

/* Decode and execute the instruction at the current position.
 *
 * Returns CREOLE_STEP_STOP at the end of the program,
 * CREOLE_STEP_CONTINUE after an ordinary instruction, and
 * CREOLE_STEP_SYSCALL after sys, in which case *sc holds the value of
 * its argument. Any other return value is an error.
 */
enum creole_run_ret creole_step(struct creole_env *env, creole_word *sc)
{
	struct ins ins = {0};
	creole_word a0, a1, a2;
	int rcode = CREOLE_STEP_CONTINUE;

	if (env->r_current.left == 0)
		return CREOLE_STEP_STOP;

	if (parse_line(env, &ins, &env->r_current) != CREOLE_COMPILE_OK)
		return CREOLE_RUN_DECODE_ERROR;

	switch (ins.opcode) {
	case CREOLE_NOOP:
		/* A valid instruction that does nothing. Without this
		 * case it would be reported as an unknown opcode.
		 */
		break;
	case CREOLE_DB:
		/* NOTE(review): unchecked index into env->dats, same as
		 * in add_to_env() -- confirm the table size upstream.
		 */
		env->dats[ins.w[0]] = ins.datapt;
		break;
	case CREOLE_PUSH:
		check(read_val(env, &ins, 0, &a1));
		check(creole_push(env, a1));
		break;
	case CREOLE_POP:
		check(creole_pop(env, &a1));
		check(creole_reg_write(env, ins.w[0], a1));
		break;
	case CREOLE_ADD:
		check(read_val(env, &ins, 1, &a1));
		check(read_val(env, &ins, 2, &a2));
		check(creole_reg_write(env, ins.w[0], a1 + a2));
		break;
	case CREOLE_MUL:
		check(read_val(env, &ins, 1, &a1));
		check(read_val(env, &ins, 2, &a2));
		check(creole_reg_write(env, ins.w[0], a1 * a2));
		break;
	case CREOLE_DIV:
		check(read_val(env, &ins, 1, &a1));
		check(read_val(env, &ins, 2, &a2));
		if (a2 == 0) {
			return CREOLE_DIV_BY_ZERO;
		}
		if (check_sign_bits(ins.w_flags[1], ins.w_flags[2])
		    == ALL_SIGNED && a2 == (creole_word)-1) {
			/* Signed division by -1 is negation, but the
			 * most negative value divided by -1 overflows,
			 * which is undefined behavior in C. Negating in
			 * the word type gives the same result for every
			 * other value and wraps for that one.
			 * (Assumes creole_word is the unsigned
			 * counterpart of creole_signed -- both are
			 * declared in creole.h.)
			 */
			a1 = (creole_word)0 - a1;
		} else {
			chk_sign_op(/);
		}
		check(creole_reg_write(env, ins.w[0], a1));
		break;
	case CREOLE_SYS:
		check(read_val(env, &ins, 0, sc));
		rcode = CREOLE_STEP_SYSCALL;
		break;
	case CREOLE_JL:
		check(read_val(env, &ins, 0, &a0));
		check(read_val(env, &ins, 1, &a1));
		check(read_val(env, &ins, 2, &a2));
		chk_sign_op(<);
		if (a1 && !creole_jump(env, a0))
			return CREOLE_JUMP_OVERFLOW;
		break;
	case CREOLE_JLE:
		check(read_val(env, &ins, 0, &a0));
		check(read_val(env, &ins, 1, &a1));
		check(read_val(env, &ins, 2, &a2));
		chk_sign_op(<=);
		if (a1 && !creole_jump(env, a0))
			return CREOLE_JUMP_OVERFLOW;
		break;
	case CREOLE_JE:
		check(read_val(env, &ins, 0, &a0));
		check(read_val(env, &ins, 1, &a1));
		check(read_val(env, &ins, 2, &a2));
		if (a1 == a2 && !creole_jump(env, a0))
			return CREOLE_JUMP_OVERFLOW;
		break;
	case CREOLE_JNE:
		check(read_val(env, &ins, 0, &a0));
		check(read_val(env, &ins, 1, &a1));
		check(read_val(env, &ins, 2, &a2));
		if (a1 != a2 && !creole_jump(env, a0))
			return CREOLE_JUMP_OVERFLOW;
		break;
	default:
		rcode = CREOLE_STEP_UNKNOWN_OPCODE;
	}

	return rcode;
}
#undef check