#include "creole.h"
#include <limits.h> /* CHAR_BIT, used by creole_encode() */

/*************************************************************************
 * Static information
 ************************************************************************/

/* Arguments to opcodes can accept the following:
 *  immediate values only (as of now, no values are like this)
 *  register values only (push, pop, etc.)
 *  either values (math operations)
 *  labels (jumps)
 *  none (do not give an argument)
 */
enum creole_arg_type {
	TYPE_NONE,
	TYPE_IMM,
	TYPE_REG,
	TYPE_VAL,
	TYPE_LAB,
	CREOLE_ARG_TYPE_LEN
};

/* C99+ allows for designating the array index when initializing arrays:
 *     [i] = v,
 * in C89 indices are implicit from 0 to the maximum filled-in value.
 *
 * The first macro argument (the opcode name) is documentation only; the
 * position of each entry in the table is what selects the opcode.
 */
#define defop(s, n, a1, a2, a3) {n, {a1, a2, a3}}
static const struct {
	/* Number of arguments the opcode takes. */
	unsigned arglen;
	/* Accepted type of each argument; unused slots are TYPE_NONE. */
	enum creole_arg_type argtype[CREOLE_MAX_ARG];
} opcode_info[CREOLE_OPCODE_LEN] = {
	defop(NOOP, 0, TYPE_NONE, TYPE_NONE, TYPE_NONE),
	defop(PUSH, 1, TYPE_VAL, TYPE_NONE, TYPE_NONE),
	defop(POP, 1, TYPE_REG, TYPE_NONE, TYPE_NONE),
	defop(ADD, 3, TYPE_REG, TYPE_VAL, TYPE_VAL),
	defop(MUL, 3, TYPE_REG, TYPE_VAL, TYPE_VAL),
	defop(DIV, 3, TYPE_REG, TYPE_VAL, TYPE_VAL),
	defop(JL, 3, TYPE_LAB, TYPE_VAL, TYPE_VAL),
	defop(CLB, 1, TYPE_LAB, TYPE_NONE, TYPE_NONE),
	defop(SYS, 1, TYPE_VAL, TYPE_NONE, TYPE_NONE)
};

/*************************************************************************
 * Reading from the buffer
 ************************************************************************/

/* Consume one element from the reader.
 * Returns the element, or -1 when nothing is left.
 */
static int read(struct creole_reader *r)
{
	if (r->left == 0)
		return -1;
	r->left--;
	return *r->p++;
}

/* Returns non-zero when the reader has been fully consumed. */
static int read_eof(struct creole_reader *r)
{
	return r->left == 0;
}

/*************************************************************************
 * Pseudo-UTF-8 lexing
 *
 * Pseudo-UTF-8 is based off of UTF-8 but adds more
 * bytes and allows (requires!) overlong encodings.
 *
 * Possible values:
 * 0xxxxxxx                                                        (7 bits)
 * 110HHHHx 10xxxxxx                                               (11 bits)
 * 1110HHHH 10xxxxxx 10xxxxxx                                      (16 bits)
 * 11110HHH 10Hxxxxx 10xxxxxx 10xxxxxx                             (21 bits)
 * 111110HH 10HHxxxx 10xxxxxx 10xxxxxx 10xxxxxx                    (26 bits)
 * 1111110H 10HHHxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx           (31 bits)
 * 11111110 10HHHHxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx           (36 bits)
 *          10xxxxxx
 ************************************************************************/

/* A Pseudo-UTF-8 sequence can be either
 *
 * * A 1 byte sequence, where the lower 7 bits are the encoded
 * * word (no high bits).
 * * A multi-byte sequence where the 4 MSB are flags, and the
 * * lower bits are the encoded word.
 */
#define MAX_HIGH_BITS 15
struct word {
	/* Total length of the sequence in bytes (1 to 7). */
	int len;
	/* The "H" flag bits collected from the sequence. */
	int high_bits;
	/* The decoded "x" payload bits. */
	creole_word word;
};

/* Decode a set of continuation bytes directly into the word. This assumes
 * that each continuation byte contains no high bits.
 *
 * Returns 1 on success, 0 if the reader runs out or a byte does not carry
 * the 10xxxxxx continuation marker.
 */
static int read_continue(struct creole_reader *r, struct word *w,
                         int to_read)
{
	int i;
	int r_ret;
	unsigned char c;

	for (i = 0; i < to_read; i++) {
		r_ret = read(r);
		if (r_ret < 0) {
			return 0;
		}

		/* Characters might not be 8 bits! */
		c = (unsigned char)(r_ret & 0xFF);
		if (c >> 6 != 0x2) {
			return 0;
		}
		w->word = (w->word << 6) | (c & 0x3F);
	}

	return 1;
}

/* Start bytes must be treated differently. Depending on the scenario,
 * start bytes will contain parts of the encoded word and high-bit flags.
 * In some cases, not all of the high-bit flags are part of the start
 * byte.
 *
 * On success fills in w->len, w->word and w->high_bits from the start
 * byte and returns the number of plain continuation bytes that follow.
 * Returns -1 if the byte is not a valid start byte.
 */
#define START_BYTE_NUM 7
static int parse_start_byte(unsigned char c, struct word *w)
{
	static const struct {
		/* The algorithm compares the mask to the start byte
		 * by shifting both to the right by the amount of 'x's
		 * (blank spaces). The array is arranged in reverse
		 * order so that the index indicates the amount of
		 * bits to shift.
		 */
		unsigned char mask;

		/* The word bits, if they exist, always start from the
		 * LSB, so there is no need to shift the bits away. The
		 * word_mask gets the low bits. If there are no bits, set
		 * to 0.
		 */
		unsigned char word_mask;

		/* The high bits may not start from the LSB. There needs
		 * to be a shift to get the bits to the LSB, and a mask
		 * to discard the higher bits.
		 */
		unsigned char high_bit_mask;
		int high_bit_shift;

		/* The amount of NORMAL continuation bytes to read.
		 * This does NOT include continuation bytes that have
		 * high-bit flags in them.
		 */
		int to_read;
	} start_data[START_BYTE_NUM-1] = {
		{0xFE, 0x00, 0x0, 0, 5}, /* 11111110 */
		{0xFC, 0x00, 0x1, 0, 4}, /* 1111110x */
		{0xF8, 0x00, 0x3, 0, 3}, /* 111110xx */
		{0xF0, 0x00, 0x7, 0, 2}, /* 11110xxx */
		{0xE0, 0x00, 0xF, 0, 2}, /* 1110xxxx */
		{0xC0, 0x01, 0xF, 1, 1}  /* 110xxxxx */
	};

	int i;

	for (i = 0; i < START_BYTE_NUM-1; i++) {
		if (c >> i == start_data[i].mask >> i) {
			w->len = START_BYTE_NUM - i;
			w->word = c & start_data[i].word_mask;
			w->high_bits = (c >> start_data[i].high_bit_shift)
			               & start_data[i].high_bit_mask;
			return start_data[i].to_read;
		}
	}

	/* i == 7: a plain single byte, 0xxxxxxx. */
	if (c >> 7 == 0) {
		w->len = 1;
		w->word = c;
		w->high_bits = 0;
		return 0;
	}

	return -1;
}

/* This parses the first continuation byte if it is special, i.e. if it
 * carries some of the high-bit flags (sequences of 4 or more bytes).
 *
 * Returns 1 on success. Returns 0 if w->len does not describe a sequence
 * with a special byte, or if c is not a 10xxxxxx continuation byte.
 */
#define SPECIAL_CONTINUE_BYTE_NUM (START_BYTE_NUM - 3)
static int parse_special_byte(unsigned char c, struct word *w)
{
	/* The index denotes the amount of high bits that were in
	 * the start byte. This is the amount that the stored value
	 * must be shifted.
	 *
	 * The amount of bits that must be shifted out in the continue
	 * byte increase with the index. The amount shifted is (i + 2).
	 *
	 * Each value stored in the array is the mask applied after
	 * shifting the continue byte bits.
	 */
	static const unsigned char mask[SPECIAL_CONTINUE_BYTE_NUM] = {
		0x1, /* 11110HHH 10Hxxxxx */
		0x3, /* 111110HH 10HHxxxx */
		0x7, /* 1111110H 10HHHxxx */
		0xF  /* 11111110 10HHHHxx */
	};
	static const unsigned char wordmask[SPECIAL_CONTINUE_BYTE_NUM] = {
		0x1F, 0xF, 0x7, 0x3
	};

	int i = w->len - 4;

	/* Guard both ends: the index is used for table lookups and as a
	 * shift count below. */
	if (i < 0 || i >= SPECIAL_CONTINUE_BYTE_NUM)
		return 0;

	/* The special byte is still a continuation byte and must carry
	 * the 10xxxxxx marker, exactly like the bytes checked in
	 * read_continue(). */
	if (c >> 6 != 0x2)
		return 0;

	w->high_bits = (w->high_bits << (i + 1)) | ((c >> (5 - i)) & mask[i]);
	w->word = c & wordmask[i];

	return 1;
}

/* Parse an entire Pseudo-UTF8 sequence.
 * Returns 1 and fills in *w on success, 0 on a short read or malformed
 * sequence.
 */
static int decode_seq(struct creole_reader *r, struct word *w)
{
	int r_ret;
	int to_read;

	w->high_bits = 0;

	r_ret = read(r);
	if (r_ret < 0)
		return 0;

	to_read = parse_start_byte((unsigned char)(r_ret & 0xFF), w);
	if (to_read < 0)
		return 0;

	/* If to_read is not one less than w->len, that means there are
	 * high bits in the first continuation byte.
	 */
	if (w->len - to_read > 1) {
		r_ret = read(r);
		if (r_ret < 0)
			return 0;
		if (!parse_special_byte((unsigned char)(r_ret & 0xFF), w))
			return 0;
	}

	return read_continue(r, w, to_read);
}

/* Encode the word i, together with its flag bits, as a Pseudo-UTF-8
 * sequence of exactly encode_to bytes written to buf.
 *
 *  i          payload to encode
 *  encode_to  sequence length in bytes, 1 to 7
 *  high_bits  flag bits for multi-byte sequences (0 to MAX_HIGH_BITS);
 *             ignored when encode_to is 1
 *  buf        output buffer of at least 7 bytes
 *
 * Returns 1 on success. Returns 0, leaving buf untouched, if encode_to
 * is out of range, i does not fit in the requested length, or high_bits
 * does not fit in the four flag bits.
 */
int creole_encode(creole_word i, unsigned encode_to, unsigned high_bits,
                  unsigned char buf[7])
{
	static const struct {
		creole_word max;
		unsigned char b1_mask;
		int high_bit_shift_b1;
		int high_bit_shift_to_right_b1;
		int data_shift_b1;
		int high_bit_mask_b2;
		int high_bit_shift_b2;
		unsigned char b2_data_mask;
	} d[] = {
		{0x7F,       0xC0, 0, 1, 6,  0x0, 0, 0x3F}, /* 2 */
		{0xFFF,      0xE0, 0, 0, 12, 0x0, 0, 0x3F}, /* 3 */
		{0x1FFFF,    0xF0, 1, 0, 17, 0x1, 5, 0x1F}, /* 4 */
		{0x3FFFFF,   0xF8, 2, 0, 22, 0x3, 4, 0x0F}, /* 5 */
		{0x7FFFFFF,  0xFC, 3, 0, 27, 0x7, 3, 0x07}, /* 6 */
		{0xFFFFFFFF, 0xFE, 4, 0, 32, 0xF, 2, 0x03}  /* 7 */
	};
	int lb;
	unsigned j;

	/* Only lengths 1..7 exist: the table above has entries for 2..7
	 * and buf holds 7 bytes. Anything else would index outside the
	 * table and, for larger lengths, write past the end of buf. */
	if (encode_to < 1 || encode_to > 7)
		return 0;

	if (encode_to == 1) {
		if (i < 0x80) {
			buf[0] = (unsigned char)i;
			return 1;
		}
		return 0;
	}

	/* Only four flag bits are representable; larger values would spill
	 * into the length marker of the start byte. */
	if (high_bits > MAX_HIGH_BITS)
		return 0;

	lb = encode_to - 2;
	if (i > d[lb].max) {
		return 0;
	}

	buf[0] = (unsigned char)(d[lb].b1_mask
	         | (high_bits >> d[lb].high_bit_shift_b1
	            << d[lb].high_bit_shift_to_right_b1));
	/* shifts greater than or equal to the bit size of a type are
	 * undefined. Data in the first byte is always aligned with the LSB.
	 */
	if (d[lb].data_shift_b1 < sizeof(i) * CHAR_BIT)
		buf[0] |= i >> d[lb].data_shift_b1;

	buf[1] = 0x80 | ((high_bits & d[lb].high_bit_mask_b2)
	                 << d[lb].high_bit_shift_b2)
	              | ((i >> ((encode_to - 2) * 6))
	                 & d[lb].b2_data_mask);

	/* Remaining bytes are plain continuation bytes, 6 payload bits
	 * each, most significant group first. */
	for (j = 2; j < encode_to; j++) {
		buf[j] = 0x80 | ((i >> ((encode_to - j - 1) * 6)) & 0x3F);
	}

	return 1;
}

/*************************************************************************
 * Parsing instructions
 *
 * This parses an entire instruction, which is
 *   a single byte sequence,
 *   zero or more multibyte sequences,
 *   one single byte of all zeros.
 *************************************************************************/

/* Read one instruction from r into ins.
 *
 * Returns CREOLE_COMPILE_OK on success, otherwise the error code naming
 * the part of the instruction (opcode, argument, terminator) that could
 * not be read or was malformed.
 */
enum creole_compiler_ret
creole_parse_line(struct creole_ins *ins, struct creole_reader *r)
{
	struct word w = {0};
	unsigned arg = 0;

	if (!decode_seq(r, &w))
		return CREOLE_OPCODE_READ_ERROR;

	ins->opcode = w.word;
	/* The opcode indexes opcode_info, so it must be bounded by the
	 * number of opcodes (not by the number of argument types). */
	if (w.word >= CREOLE_OPCODE_LEN || w.len != 1)
		return CREOLE_OPCODE_MALFORMED;

	for (arg = 0; arg < opcode_info[ins->opcode].arglen; arg++) {
		if (!decode_seq(r, &w))
			return CREOLE_ARG_READ_ERROR;
		/* Arguments are always multibyte sequences. */
		if (w.len == 1)
			return CREOLE_ARG_MALFORMED;
		ins->w[arg] = w.word;
		ins->w_flags[arg] = w.high_bits;
	}

	/* Every instruction ends with a single zero byte. */
	if (!decode_seq(r, &w))
		return CREOLE_LAST_READ_ERROR;
	if (w.word != 0 || w.len != 1)
		return CREOLE_LAST_MALFORMED;
	return CREOLE_COMPILE_OK;
}

/**************************************************************************
 * High level compiling interface
 *************************************************************************/

/* Returns non-zero if an argument carrying flag fl is acceptable where
 * an argument of type typ is expected.
 */
static int typecheck(enum creole_word_flag fl, enum creole_arg_type typ)
{
	switch (typ) {
	case TYPE_IMM:
		return fl == CREOLE_IMMEDIATE;
	case TYPE_REG:
		return fl == CREOLE_REGISTER;
	case TYPE_VAL:
		return fl == CREOLE_IMMEDIATE || fl == CREOLE_REGISTER;
	case TYPE_LAB:
		return fl == CREOLE_IMMEDIATE;
	default:
		return 0;
	}
}

/* Check every argument of ins against the types its opcode expects.
 * Returns CREOLE_COMPILE_OK or CREOLE_TYPE_ERROR.
 */
static enum creole_compiler_ret
typecheck_ins(struct creole_env *env, struct creole_ins *ins)
{
	unsigned i;

	(void)env; /* currently unused */

	for (i = 0; i < opcode_info[ins->opcode].arglen; i++) {
		if (!typecheck(ins->w_flags[i],
		               opcode_info[ins->opcode].argtype[i]))
			return CREOLE_TYPE_ERROR;
	}
	return CREOLE_COMPILE_OK;
}

/* Reset an instruction slot: opcode, arguments and argument flags. */
static void clear_ins(struct creole_ins *i)
{
	i->opcode = 0;
	i->w[0] = 0;
	i->w[1] = 0;
	i->w[2] = 0;
	i->w_flags[0] = 0;
	i->w_flags[1] = 0;
	i->w_flags[2] = 0;
}

/* Handle instructions that only have meaning at compile time.
 *
 * CLB records the current program position under the given label and
 * NOOP is dropped; both clear the slot and return
 * CREOLE_COMPILE_CLEARED_INSTRUCTION so the caller reuses it. CLB with
 * a label index outside env->lab returns CREOLE_LABEL_OVERFLOW. Every
 * other opcode is type checked and the result of that check returned.
 */
static enum creole_compiler_ret
handle_compiletime_immediate(struct creole_env *env,
                             struct creole_ins *cur_ins)
{
	switch (cur_ins->opcode) {
	case CREOLE_CLB:
		if (cur_ins->w[0] >= env->lablen)
			return CREOLE_LABEL_OVERFLOW;
		env->lab[cur_ins->w[0]] = env->prgptr;
		/* Delete instruction because it is a compile time
		 * instruction. Place next instruction in its place. */
		clear_ins(cur_ins);
		return CREOLE_COMPILE_CLEARED_INSTRUCTION;
	case CREOLE_NOOP:
		clear_ins(cur_ins);
		return CREOLE_COMPILE_CLEARED_INSTRUCTION;
	default:
		return typecheck_ins(env, cur_ins);
	}
}

/* Compile the instruction stream in r into env->prg.
 *
 * Instructions are parsed one at a time until the reader is exhausted or
 * the program buffer is full. On success env->prgend is set to the
 * number of stored instructions, env->prgptr is reset to 0 and
 * CREOLE_COMPILE_OK is returned. The first parse or compile error is
 * returned as-is; CREOLE_PROGRAM_OVERFLOW is returned if input remains
 * once the program buffer is full.
 */
enum creole_compiler_ret
creole_compile(struct creole_env *env, struct creole_reader *r)
{
	struct creole_ins *cur_ins = env->prg;
	enum creole_compiler_ret rcode;

	while (env->prgptr < env->prglen) {
		rcode = creole_parse_line(cur_ins, r);
		if (rcode != CREOLE_COMPILE_OK)
			return rcode;

		rcode = handle_compiletime_immediate(env, cur_ins);
		switch (rcode) {
		case CREOLE_COMPILE_CLEARED_INSTRUCTION:
			/* Slot was cleared: reuse it for the next line. */
			break;
		case CREOLE_COMPILE_OK:
			cur_ins++;
			env->prgptr++;
			break;
		default:
			return rcode;
		}

		if (read_eof(r))
			break;
	}

	if (env->prgptr == env->prglen && !read_eof(r))
		return CREOLE_PROGRAM_OVERFLOW;
	env->prgend = env->prgptr;
	env->prgptr = 0;
	return CREOLE_COMPILE_OK;
}

/* NOTE(review): the interpreter below is disabled and kept unchanged.
 * As written it would not compile: read_word() uses `env`, which is not
 * one of its parameters; the operand accesses go through `env->w` /
 * `env->w_flags` although the instruction is in `ins`; `case SYS`
 * presumably should be CREOLE_SYS like the other cases; and
 * creole_step() has no return after the switch. Confirm intent before
 * enabling.
 */
#if 0
static creole_word read_word(struct creole_ins *ins, int i)
{
	if (env->w_flags[i] == CREOLE_REGISTER)
		return env->reg[env->w[i]];
	else
		return env->w[i];
}

int creole_step(struct creole_env *env)
{
	struct creole_ins *ins = env->prg + env->prgptr;
	creole_word a1, a2;

	if (env->prgptr == env->prgend)
		return CREOLE_STEP_STOP;
	env->prgptr++;

	switch (ins->opcode) {
	case CREOLE_PUSH:
		if (env->stkptr == env->stklen)
			return CREOLE_STACK_OVERFLOW;
		env->stk[env->stkptr++] = env->reg[env->w[0]];
		break;
	case CREOLE_POP:
		if (env->stkptr == 0)
			return CREOLE_STACK_OVERFLOW;
		env->reg[env->w[0]] = env->stk[--env->stkptr];
		break;
	case CREOLE_ADD:
		a1 = read_word(ins, 1);
		a2 = read_word(ins, 2);
		env->reg[env->w[0]] = a1 + a2;
		break;
	case CREOLE_MUL:
		a1 = read_word(ins, 1);
		a2 = read_word(ins, 2);
		env->reg[env->w[0]] = a1 * a2;
		break;
	case CREOLE_DIV:
		a1 = read_word(ins, 1);
		a2 = read_word(ins, 2);
		env->reg[env->w[0]] = a1 / a2;
		break;
	case CREOLE_JL:
		a1 = read_word(ins, 1);
		a2 = read_word(ins, 2);
		if (a1 < a2)
			env->prgptr = env->lab[env->w[0]];
		break;
	case SYS:
		a1 = read_word(ins, 1);
		/* do syscall */
		break;
	}
}
#endif