aboutsummaryrefslogtreecommitdiffstats
path: root/creole.c
diff options
context:
space:
mode:
authorGravatar Peter McGoron 2023-02-07 04:38:39 +0000
committerGravatar Peter McGoron 2023-02-07 04:38:39 +0000
commitb053379db2cc84a25852126a2b06e9abbe184937 (patch)
treece315fe5b17a2c5d2c9f31800278d402c6398b2d /creole.c
parentprototype bytecode interpreter (diff)
encode
Diffstat (limited to 'creole.c')
-rw-r--r--creole.c109
1 files changed, 87 insertions, 22 deletions
diff --git a/creole.c b/creole.c
index 4259e42..58b6c83 100644
--- a/creole.c
+++ b/creole.c
@@ -57,8 +57,6 @@ static int read_eof(struct creole_reader *r)
return r->left == 0;
}
-#if 0
-
/*************************************************************************
* Pseudo-UTF-8 lexing
*
@@ -67,12 +65,12 @@ static int read_eof(struct creole_reader *r)
*
* Possible values:
* 0xxxxxxx (7 bits)
- * 110xxxxx 10xxxxxx (11 bits)
- * 1110xxxx 10xxxxxx 10xxxxxx (16 bits)
- * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (21 bits)
- * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (26 bits)
- * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (31 bits)
- * 11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (36 bits)
+ * 110HHHHx 10xxxxxx (11 bits)
+ * 1110HHHH 10xxxxxx 10xxxxxx (16 bits)
+ * 11110HHH 10Hxxxxx 10xxxxxx 10xxxxxx (21 bits)
+ * 111110HH 10HHxxxx 10xxxxxx 10xxxxxx 10xxxxxx (26 bits)
+ * 1111110H 10HHHxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (31 bits)
+ * 11111110 10HHHHxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (36 bits)
* 10xxxxxx
************************************************************************/
@@ -84,6 +82,7 @@ static int read_eof(struct creole_reader *r)
* * A multi-byte sequence where the 4 MSB are flags, and the
* * lower bits are the encoded word.
*/
+#define MAX_HIGH_BITS 15
struct word {
int len;
int high_bits;
@@ -93,14 +92,14 @@ struct word {
/* Decode a set of continuation bytes directly into the word. This assumes
* that each continuation byte contains no high bits.
*/
-static int read_continue(struct creole_reader *r, struct encoded_word *w,
+static int read_continue(struct creole_reader *r, struct word *w,
int to_read)
{
int i;
int r_ret;
unsigned char c;
- for (i = 0; i < to_read) {
+ for (i = 0; i < to_read; i++) {
r_ret = read(r);
if (r_ret < 0)
return 0;
@@ -108,7 +107,7 @@ static int read_continue(struct creole_reader *r, struct encoded_word *w,
c = (unsigned char)(r_ret & 0xFF);
if (c >> 6 != 0x2)
return 0;
- w->word = w->word << 6 | (c & 0x6);
+ w->word = w->word << 6 | (c & 0x3F);
}
return 1;
@@ -130,6 +129,7 @@ static int parse_start_byte(unsigned char c, struct word *w)
* bits to shift.
*/
unsigned char mask;
+
/* The word bits, if they exist, always start from the
* LSB, so there is no need to shift the bits away. The
* word_mask gets the low bits. If there are no bits, set
@@ -149,20 +149,18 @@ static int parse_start_byte(unsigned char c, struct word *w)
* high-bit flags in them.
*/
int to_read;
- } start_data[START_BYTE_NUM] {
+ } start_data[START_BYTE_NUM-1] = {
{0xFE, 0x00, 0, 0x0, 5}, /* 11111110 */
- {0xFC, 0x00, 0, 0x1, 4}, /* 1111110x */,
- {0xF8, 0x00, 0, 0x3, 3}, /* 111110xx */,
- {0xF0, 0x00, 0, 0x7, 2}, /* 11110xxx */,
- {0xE0, 0x00, 0, 0xF, 2}, /* 1110xxxx */,
- {0xC0, 0x01, 1, 0xF, 1}, /* 110xxxxx */,
- /* The single byte sequence has no high bits. */
- {0x00, 0x7F, 0, 0x0, 0} /* 0xxxxxxx */,
+ {0xFC, 0x00, 0, 0x1, 4}, /* 1111110x */
+ {0xF8, 0x00, 0, 0x3, 3}, /* 111110xx */
+ {0xF0, 0x00, 0, 0x7, 2}, /* 11110xxx */
+ {0xE0, 0x00, 0, 0xF, 2}, /* 1110xxxx */
+ {0xC0, 0x01, 1, 0xF, 1} /* 110xxxxx */
};
int i;
- for (i = 0; i < START_BYTE_NUM; i++) {
+ for (i = 0; i < START_BYTE_NUM-1; i++) {
if (c >> i == start_data[i].mask >> i) {
w->len = START_BYTE_NUM - i;
w->word = c & start_data[i].word_mask;
@@ -171,6 +169,13 @@ static int parse_start_byte(unsigned char c, struct word *w)
return start_data[i].to_read;
}
}
+ /* i == 7 */
+ if (c >> 7 == 0) {
+ w->len = 1;
+ w->word = c;
+ w->high_bits = 0;
+ return 0;
+ }
return -1;
}
@@ -203,7 +208,6 @@ static void parse_special_byte(unsigned char c, struct word *w)
static int decode_seq(struct creole_reader *r, struct word *w)
{
int r_ret;
- unsigned char c;
int to_read;
r_ret = read(r);
@@ -224,9 +228,70 @@ static int decode_seq(struct creole_reader *r, struct word *w)
parse_special_byte((unsigned char)(r_ret & 0xFF), w);
}
- return read_continue(r, decoded_word, to_read);
+ return read_continue(r, w, to_read);
+}
+
+/* Encode the word `i` into `buf` as a pseudo-UTF-8 sequence of exactly
+ * `encode_to` bytes (1 through 7), carrying the flag bits `high_bits`.
+ *
+ * Returns 1 on success. Returns 0, without writing to `buf`, when
+ * `encode_to` is outside 1..7, when `high_bits` exceeds MAX_HIGH_BITS,
+ * or when `i` needs more data bits than the requested length provides.
+ *
+ * A single-byte sequence has no room for flags, so `high_bits` is
+ * ignored when `encode_to` is 1.
+ */
+int creole_encode(creole_word i, unsigned encode_to, unsigned high_bits,
+                  unsigned char buf[7])
+{
+	/* Layout of the first two bytes for each multi-byte length.
+	 * Entry k describes sequences of k + 2 bytes.
+	 */
+	static const struct {
+		/* Largest word representable at this length. */
+		creole_word max;
+		/* Length-marker bits of the start byte. */
+		unsigned char b1_mask;
+		/* Right shift discarding the flag bits kept in byte 2. */
+		unsigned high_bit_rshift_b1;
+		/* Left shift placing the flags above the data bit of byte 1
+		 * (only the two-byte form has a data bit there).
+		 */
+		unsigned high_bit_lshift_b1;
+		/* Right shift leaving only the word bits kept in byte 1. */
+		unsigned data_shift_b1;
+
+		/* Selects the flag bits kept in byte 2. */
+		unsigned high_bit_mask_b2;
+		/* Left shift placing those flags above the data of byte 2. */
+		unsigned high_bit_shift_b2;
+		/* Selects the word bits kept in byte 2. */
+		unsigned char b2_data_mask;
+	} d[] = {
+		{0x7F,       0xC0, 0, 1, 6,  0x0, 0, 0x3F}, /* 2 */
+		{0xFFF,      0xE0, 0, 0, 12, 0x0, 0, 0x3F}, /* 3 */
+		{0x1FFFF,    0xF0, 1, 0, 17, 0x1, 5, 0x1F}, /* 4 */
+		{0x3FFFFF,   0xF8, 2, 0, 22, 0x3, 4, 0x0F}, /* 5 */
+		{0x7FFFFFF,  0xFC, 3, 0, 27, 0x7, 3, 0x07}, /* 6 */
+		{0xFFFFFFFF, 0xFE, 4, 0, 32, 0xF, 2, 0x03}  /* 7 */
+	};
+	unsigned lb;
+	unsigned j;
+
+	/* Lengths 2..7 map onto the table. Anything else (including 0,
+	 * which would wrap around in `encode_to - 2`) must be rejected
+	 * before indexing `d` or writing past the 7-byte buffer.
+	 */
+	if (encode_to < 1 || encode_to > sizeof(d) / sizeof(d[0]) + 1)
+		return 0;
+
+	if (encode_to == 1) {
+		if (i < 0x80) {
+			buf[0] = (unsigned char)i;
+			return 1;
+		}
+		return 0;
+	}
+
+	/* Flags wider than four bits would spill into the length-marker
+	 * bits of the start byte and corrupt the sequence.
+	 */
+	if (high_bits > MAX_HIGH_BITS)
+		return 0;
+
+	lb = encode_to - 2;
+	if (i > d[lb].max)
+		return 0;
+
+	buf[0] = (unsigned char)(d[lb].b1_mask
+	                         | (high_bits >> d[lb].high_bit_rshift_b1
+	                            << d[lb].high_bit_lshift_b1));
+	/* Shifts greater than or equal to the bit size of a type are
+	 * undefined, so skip the shift when no word bit can reach byte 1.
+	 * Data in the first byte is always aligned with the LSB.
+	 */
+	if (d[lb].data_shift_b1 < sizeof(i) * CHAR_BIT)
+		buf[0] |= (unsigned char)(i >> d[lb].data_shift_b1);
+
+	/* Byte 2 carries the remaining flags above the next word bits. */
+	buf[1] = (unsigned char)(0x80
+	                         | ((high_bits & d[lb].high_bit_mask_b2)
+	                            << d[lb].high_bit_shift_b2)
+	                         | ((i >> ((encode_to - 2) * 6))
+	                            & d[lb].b2_data_mask));
+
+	/* Every later byte is a plain continuation byte with six word
+	 * bits, most significant group first.
+	 */
+	for (j = 2; j < encode_to; j++) {
+		buf[j] = (unsigned char)(0x80
+		                         | ((i >> ((encode_to - j - 1) * 6)) & 0x3F));
+	}
+
+	return 1;
}
+#if 0
+
/*************************************************************************
* Parsing instructions
*