diff options
| author | 2023-02-07 04:38:39 +0000 | |
|---|---|---|
| committer | 2023-02-07 04:38:39 +0000 | |
| commit | b053379db2cc84a25852126a2b06e9abbe184937 (patch) | |
| tree | ce315fe5b17a2c5d2c9f31800278d402c6398b2d /creole.c | |
| parent | prototype bytecode interpreter (diff) | |
encode
Diffstat (limited to 'creole.c')
| -rw-r--r-- | creole.c | 109 |
1 files changed, 87 insertions, 22 deletions
@@ -57,8 +57,6 @@ static int read_eof(struct creole_reader *r) return r->left == 0; } -#if 0 - /************************************************************************* * Pseudo-UTF-8 lexing * @@ -67,12 +65,12 @@ static int read_eof(struct creole_reader *r) * * Possible values: * 0xxxxxxx (7 bits) - * 110xxxxx 10xxxxxx (11 bits) - * 1110xxxx 10xxxxxx 10xxxxxx (16 bits) - * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (21 bits) - * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (26 bits) - * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (31 bits) - * 11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (36 bits) + * 110HHHHx 10xxxxxx (11 bits) + * 1110HHHH 10xxxxxx 10xxxxxx (16 bits) + * 11110HHH 10Hxxxxx 10xxxxxx 10xxxxxx (21 bits) + * 111110HH 10HHxxxx 10xxxxxx 10xxxxxx 10xxxxxx (26 bits) + * 1111110H 10HHHxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (31 bits) + * 11111110 10HHHHxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (36 bits) * 10xxxxxx ************************************************************************/ @@ -84,6 +82,7 @@ static int read_eof(struct creole_reader *r) * * A multi-byte sequence where the 4 MSB are flags, and the * * lower bits are the encoded word. */ +#define MAX_HIGH_BITS 15 struct word { int len; int high_bits; @@ -93,14 +92,14 @@ struct word { /* Decode a set of continuation bytes directly into the word. This assumes * that each continuation byte contains no high words. */ -static int read_continue(struct creole_reader *r, struct encoded_word *w, +static int read_continue(struct creole_reader *r, struct word *w, int to_read) { int i; int r_ret; unsigned char c; - for (i = 0; i < to_read) { + for (i = 0; i < to_read; i++) { r_ret = read(r); if (r_ret < 0) return 0; @@ -108,7 +107,7 @@ static int read_continue(struct creole_reader *r, struct encoded_word *w, c = (unsigned char)(r_ret & 0xFF); if (c >> 6 != 0x2) return 0; - w->word = w->word << 6 | (c & 0x6); + w->word = w->word << 6 | (c & 0x3F); } return 1; @@ -130,6 +129,7 @@ static int parse_start_byte(unsigned char c, struct word *w) * bits to shift. */ unsigned char mask; + /* The word bits, if they exist, always start from the * LSB, so there is no need to shift the bits away. The * word_mask gets the low bits. If there are no bits, set @@ -149,20 +149,18 @@ static int parse_start_byte(unsigned char c, struct word *w) * high-bit flags in them. */ int to_read; - } start_data[START_BYTE_NUM] { + } start_data[START_BYTE_NUM-1] = { {0xFE, 0x00, 0, 0x0, 5}, /* 11111110 */ - {0xFC, 0x00, 0, 0x1, 4}, /* 1111110x */, - {0xF8, 0x00, 0, 0x3, 3}, /* 111110xx */, - {0xF0, 0x00, 0, 0x7, 2}, /* 11110xxx */, - {0xE0, 0x00, 0, 0xF, 2}, /* 1110xxxx */, - {0xC0, 0x01, 1, 0xF, 1}, /* 110xxxxx */, - /* The single byte sequence has no high bits. */ - {0x00, 0x7F, 0, 0x0, 0} /* 0xxxxxxx */, + {0xFC, 0x00, 0, 0x1, 4}, /* 1111110x */ + {0xF8, 0x00, 0, 0x3, 3}, /* 111110xx */ + {0xF0, 0x00, 0, 0x7, 2}, /* 11110xxx */ + {0xE0, 0x00, 0, 0xF, 2}, /* 1110xxxx */ + {0xC0, 0x01, 1, 0xF, 1} /* 110xxxxx */ }; int i; - for (i = 0; i < START_BYTE_NUM; i++) { + for (i = 0; i < START_BYTE_NUM-1; i++) { if (c >> i == start_data[i].mask >> i) { w->len = START_BYTE_NUM - i; w->word = c & start_data[i].word_mask; @@ -171,6 +169,13 @@ static int parse_start_byte(unsigned char c, struct word *w) return start_data[i].to_read; } } + /* i == 7 */ + if (c >> 7 == 0) { + w->len = 1; + w->word = c; + w->high_bits = 0; + return 0; + } return -1; } @@ -203,7 +208,6 @@ static void parse_special_byte(unsigned char c, struct word *w) static int decode_seq(struct creole_reader *r, struct word *w) { int r_ret; - unsigned char c; int to_read; r_ret = read(r); @@ -224,9 +228,70 @@ static int decode_seq(struct creole_reader *r, struct word *w) parse_special_byte((unsigned char)(r_ret & 0xFF), w); } - return read_continue(r, decoded_word, to_read); + return read_continue(r, w, to_read); +} + +int creole_encode(creole_word i, unsigned encode_to, unsigned high_bits, + unsigned char buf[7]) +{ + static const struct { + creole_word max; + unsigned char b1_mask; + int high_bit_shift_b1; + int high_bit_shift_to_right_b1; + int data_shift_b1; + + int high_bit_mask_b2; + int high_bit_shift_b2; + unsigned char b2_data_mask; + } d[] = { + {0x7F, 0xC0, 0, 1, 6, 0x0, 0, 0x3F}, /* 2 */ + {0xFFF, 0xE0, 0, 0, 12, 0x0, 0, 0x3F}, /* 3 */ + {0x1FFFF, 0xF0, 1, 0, 17, 0x1, 5, 0x1F}, /* 4 */ + {0x3FFFFF, 0xF8, 2, 0, 22, 0x3, 4, 0x0F}, /* 5 */ + {0x7FFFFFF, 0xFC, 3, 0, 27, 0x7, 3, 0x07}, /* 6 */ + {0xFFFFFFFF, 0xFE, 4, 0, 32, 0xF, 2, 0x03} /* 7 */ + }; + unsigned lb; + unsigned j; + + if (encode_to > 8) + return 0; + + if (encode_to == 1) { + if (i < 0x80) { + buf[0] = i; + return 1; + } + return 0; + } + + lb = encode_to - 2; + if (i > d[lb].max) + return 0; + + buf[0] = (d[lb].b1_mask | (high_bits >> d[lb].high_bit_shift_b1 + << d[lb].high_bit_shift_to_right_b1)); + /* shifts greater than or equal to the bit size of a type are + * undefined. Data in the first byte is always aligned with the LSB. + */ + if (d[lb].data_shift_b1 < sizeof(i) * CHAR_BIT) + buf[0] |= i >> d[lb].data_shift_b1; + + buf[1] = 0x80 | ((high_bits & d[lb].high_bit_mask_b2) + << d[lb].high_bit_shift_b2) + | ((i >> ((encode_to - 2) * 6)) + & d[lb].b2_data_mask); + + for (j = 2; j < encode_to; j++) { + buf[j] = 0x80 | ((i >> ((encode_to - j - 1) * 6)) & 0x3F); + } + + return 1; } +#if 0 + /************************************************************************* * Parsing instructions * |
