aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGravatar Peter McGoron 2023-02-07 04:38:39 +0000
committerGravatar Peter McGoron 2023-02-07 04:38:39 +0000
commitb053379db2cc84a25852126a2b06e9abbe184937 (patch)
treece315fe5b17a2c5d2c9f31800278d402c6398b2d
parentprototype bytecode interpreter (diff)
encode
-rw-r--r--Makefile2
-rw-r--r--creole.c109
-rw-r--r--test_creole.c176
3 files changed, 263 insertions, 24 deletions
diff --git a/Makefile b/Makefile
index aaa56f5..2b34dfb 100644
--- a/Makefile
+++ b/Makefile
@@ -1,2 +1,2 @@
test_creole: test_creole.c creole.c creole.h greatest.h
- $(CC) test_creole.c -Wall -pedantic -std=c89 -o test_creole
+ $(CC) -g test_creole.c -Wall -pedantic -std=c89 -o test_creole
diff --git a/creole.c b/creole.c
index 4259e42..58b6c83 100644
--- a/creole.c
+++ b/creole.c
@@ -57,8 +57,6 @@ static int read_eof(struct creole_reader *r)
return r->left == 0;
}
-#if 0
-
/*************************************************************************
* Pseudo-UTF-8 lexing
*
@@ -67,12 +65,12 @@ static int read_eof(struct creole_reader *r)
*
* Possible values:
* 0xxxxxxx (7 bits)
- * 110xxxxx 10xxxxxx (11 bits)
- * 1110xxxx 10xxxxxx 10xxxxxx (16 bits)
- * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (21 bits)
- * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (26 bits)
- * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (31 bits)
- * 11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (36 bits)
+ * 110HHHHx 10xxxxxx (11 bits)
+ * 1110HHHH 10xxxxxx 10xxxxxx (16 bits)
+ * 11110HHH 10Hxxxxx 10xxxxxx 10xxxxxx (21 bits)
+ * 111110HH 10HHxxxx 10xxxxxx 10xxxxxx 10xxxxxx (26 bits)
+ * 1111110H 10HHHxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (31 bits)
+ * 11111110 10HHHHxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (36 bits)
* 10xxxxxx
************************************************************************/
@@ -84,6 +82,7 @@ static int read_eof(struct creole_reader *r)
* * A multi-byte sequence where the 4 MSB are flags, and the
* * lower bits are the encoded word.
*/
+#define MAX_HIGH_BITS 15
struct word {
int len;
int high_bits;
@@ -93,14 +92,14 @@ struct word {
/* Decode a set of continuation bytes directly into the word. This assumes
* that each continuation byte contains no high words.
*/
-static int read_continue(struct creole_reader *r, struct encoded_word *w,
+static int read_continue(struct creole_reader *r, struct word *w,
int to_read)
{
int i;
int r_ret;
unsigned char c;
- for (i = 0; i < to_read) {
+ for (i = 0; i < to_read; i++) {
r_ret = read(r);
if (r_ret < 0)
return 0;
@@ -108,7 +107,7 @@ static int read_continue(struct creole_reader *r, struct encoded_word *w,
c = (unsigned char)(r_ret & 0xFF);
if (c >> 6 != 0x2)
return 0;
- w->word = w->word << 6 | (c & 0x6);
+ w->word = w->word << 6 | (c & 0x3F);
}
return 1;
@@ -130,6 +129,7 @@ static int parse_start_byte(unsigned char c, struct word *w)
* bits to shift.
*/
unsigned char mask;
+
/* The word bits, if they exist, always start from the
* LSB, so there is no need to shift the bits away. The
* word_mask gets the low bits. If there are no bits, set
@@ -149,20 +149,18 @@ static int parse_start_byte(unsigned char c, struct word *w)
* high-bit flags in them.
*/
int to_read;
- } start_data[START_BYTE_NUM] {
+ } start_data[START_BYTE_NUM-1] = {
{0xFE, 0x00, 0, 0x0, 5}, /* 11111110 */
- {0xFC, 0x00, 0, 0x1, 4}, /* 1111110x */,
- {0xF8, 0x00, 0, 0x3, 3}, /* 111110xx */,
- {0xF0, 0x00, 0, 0x7, 2}, /* 11110xxx */,
- {0xE0, 0x00, 0, 0xF, 2}, /* 1110xxxx */,
- {0xC0, 0x01, 1, 0xF, 1}, /* 110xxxxx */,
- /* The single byte sequence has no high bits. */
- {0x00, 0x7F, 0, 0x0, 0} /* 0xxxxxxx */,
+ {0xFC, 0x00, 0, 0x1, 4}, /* 1111110x */
+ {0xF8, 0x00, 0, 0x3, 3}, /* 111110xx */
+ {0xF0, 0x00, 0, 0x7, 2}, /* 11110xxx */
+ {0xE0, 0x00, 0, 0xF, 2}, /* 1110xxxx */
+ {0xC0, 0x01, 1, 0xF, 1} /* 110xxxxx */
};
int i;
- for (i = 0; i < START_BYTE_NUM; i++) {
+ for (i = 0; i < START_BYTE_NUM-1; i++) {
if (c >> i == start_data[i].mask >> i) {
w->len = START_BYTE_NUM - i;
w->word = c & start_data[i].word_mask;
@@ -171,6 +169,13 @@ static int parse_start_byte(unsigned char c, struct word *w)
return start_data[i].to_read;
}
}
+ /* i == 7 */
+ if (c >> 7 == 0) {
+ w->len = 1;
+ w->word = c;
+ w->high_bits = 0;
+ return 0;
+ }
return -1;
}
@@ -203,7 +208,6 @@ static void parse_special_byte(unsigned char c, struct word *w)
static int decode_seq(struct creole_reader *r, struct word *w)
{
int r_ret;
- unsigned char c;
int to_read;
r_ret = read(r);
@@ -224,9 +228,70 @@ static int decode_seq(struct creole_reader *r, struct word *w)
parse_special_byte((unsigned char)(r_ret & 0xFF), w);
}
- return read_continue(r, decoded_word, to_read);
+ return read_continue(r, w, to_read);
+}
+
+/* Encode the word i into buf as a pseudo-UTF-8 sequence that is exactly
+ * encode_to bytes long, carrying the flag bits given in high_bits.
+ *
+ * encode_to must be between 1 and 7. A one-byte sequence stores values
+ * below 0x80 and does not store high_bits. Longer sequences put the
+ * flags in the start byte and, from four bytes up, spill them into the
+ * second byte.
+ *
+ * Returns 1 on success. Returns 0 without writing to buf when encode_to
+ * is out of range, when i is too large for the requested length, or
+ * when a multi-byte length is given flags above MAX_HIGH_BITS.
+ */
+int creole_encode(creole_word i, unsigned encode_to, unsigned high_bits,
+		unsigned char buf[7])
+{
+	/* Byte layout of each multi-byte length, indexed by encode_to - 2. */
+	static const struct {
+		/* Largest word accepted at this length. */
+		creole_word max;
+		/* Length marker bits of the start byte. */
+		unsigned char b1_mask;
+		/* Right shift dropping the flag bits that live in byte 2. */
+		int high_bit_shift_b1;
+		/* Left shift (despite the name) that leaves room for data
+		 * bits below the flags in the start byte.
+		 */
+		int high_bit_shift_to_right_b1;
+		/* Right shift selecting the data bits of the start byte. */
+		int data_shift_b1;
+
+		/* Flag bits stored in the second byte, and their position. */
+		int high_bit_mask_b2;
+		int high_bit_shift_b2;
+		/* Data bits available in the second byte. */
+		unsigned char b2_data_mask;
+	} d[] = {
+		{0x7F, 0xC0, 0, 1, 6, 0x0, 0, 0x3F}, /* 2 */
+		{0xFFF, 0xE0, 0, 0, 12, 0x0, 0, 0x3F}, /* 3 */
+		{0x1FFFF, 0xF0, 1, 0, 17, 0x1, 5, 0x1F}, /* 4 */
+		{0x3FFFFF, 0xF8, 2, 0, 22, 0x3, 4, 0x0F}, /* 5 */
+		{0x7FFFFFF, 0xFC, 3, 0, 27, 0x7, 3, 0x07}, /* 6 */
+		{0xFFFFFFFF, 0xFE, 4, 0, 32, 0xF, 2, 0x03} /* 7 */
+	};
+	unsigned lb;
+	unsigned j;
+
+	/* d[] covers lengths 2 to 7 only and buf holds 7 bytes. A length
+	 * of 0 would wrap around in encode_to - 2 and a length of 8 would
+	 * read one entry past the table, so reject both before indexing.
+	 */
+	if (encode_to < 1 || encode_to > 7)
+		return 0;
+
+	if (encode_to == 1) {
+		if (i < 0x80) {
+			buf[0] = (unsigned char)i;
+			return 1;
+		}
+		return 0;
+	}
+
+	/* Wider flag values would overwrite the length marker bits. */
+	if (high_bits > MAX_HIGH_BITS)
+		return 0;
+
+	lb = encode_to - 2;
+	if (i > d[lb].max)
+		return 0;
+
+	buf[0] = d[lb].b1_mask | ((high_bits >> d[lb].high_bit_shift_b1)
+			<< d[lb].high_bit_shift_to_right_b1);
+	/* Shifts greater than or equal to the bit size of a type are
+	 * undefined. Data in the first byte is always aligned with the LSB.
+	 */
+	if (d[lb].data_shift_b1 < sizeof(i) * CHAR_BIT)
+		buf[0] |= i >> d[lb].data_shift_b1;
+
+	buf[1] = 0x80 | ((high_bits & d[lb].high_bit_mask_b2)
+				<< d[lb].high_bit_shift_b2)
+			| ((i >> ((encode_to - 2) * 6))
+				& d[lb].b2_data_mask);
+
+	/* The remaining bytes carry six data bits each, most significant
+	 * group first.
+	 */
+	for (j = 2; j < encode_to; j++) {
+		buf[j] = 0x80 | ((i >> ((encode_to - j - 1) * 6)) & 0x3F);
+	}
+
+	return 1;
 }
+#if 0
+
/*************************************************************************
* Parsing instructions
*
diff --git a/test_creole.c b/test_creole.c
index b8dc008..22a18b1 100644
--- a/test_creole.c
+++ b/test_creole.c
@@ -3,7 +3,7 @@ GREATEST_MAIN_DEFS();
#include "creole.c"
/**************************************************************************
- * Reader suite
+ * Reader
*************************************************************************/
#define reader_lit(r, s) do { \
r.p = (unsigned char *)s; \
@@ -37,8 +37,182 @@ SUITE(reader) {
RUN_TEST1(reader_test_basic, &r);
}
+/**************************************************************************
+ * Pseudo UTF-8 sequences
+ *************************************************************************/
+
+/* Decoding from a reader that has no bytes left must return 0. */
+TEST no_values(void) {
+	struct word w;
+	struct creole_reader r;
+
+	r.left = 0;
+	r.p = NULL;
+
+	ASSERT_EQ(decode_seq(&r, &w), 0);
+	PASS();
+}
+
+struct seq { /* one encoder test case: a sequence length, a flag value and the expected bytes */
+ creole_word max; /* last word value the tests iterate to (inclusive) */
+ unsigned encode_to; /* sequence length in bytes handed to creole_encode() */
+ unsigned high_bits; /* flag bits handed to creole_encode() */
+
+ unsigned char minbuf[7]; /* expected encoding of 0; encode_byte_seq() steps it in place */
+ unsigned char maxbuf[7]; /* expected encoding of max */
+};
+
+/* Print the eight bits of c to stdout, most significant bit first. */
+void bprint(unsigned char c) {
+	int shift = 7;
+
+	while (shift >= 0) {
+		printf("%u", (c >> shift) & 1);
+		shift--;
+	}
+}
+
+/* Print len bytes starting at b in binary, each followed by a space.
+ * Nothing is printed when len is zero or negative.
+ */
+void bprintb(unsigned char *b, int len) {
+	int k;
+
+	for (k = 0; k < len; k++) {
+		bprint(b[k]);
+		printf(" ");
+	}
+}
+
+/* Check creole_encode() against an independently stepped expected buffer
+ * for every word from 0 to s->max.
+ *
+ * s->minbuf must hold the expected encoding of 0 on entry. It is advanced
+ * in place after each word, so the caller's buffer is modified, and it
+ * must equal s->maxbuf once s->max has been checked.
+ */
+TEST encode_byte_seq(struct seq *s) {
+	unsigned char buf[7];
+	creole_word i;
+	int pos;
+
+	for (i = 0; ; i++) {
+		ASSERT_EQ(creole_encode(i, s->encode_to, s->high_bits,
+					buf), 1);
+		ASSERT_MEM_EQ(s->minbuf, buf, s->encode_to);
+
+		if (i == s->max)
+			break;
+
+		/* Step the expected bytes like a counter: trailing bytes
+		 * that reached 0xBF wrap back to 0x80 and the carry lands
+		 * on the first byte that has not, which is the start byte
+		 * when every trailing byte wrapped.
+		 */
+		pos = s->encode_to - 1;
+		while (pos > 0 && s->minbuf[pos] == 0xBF) {
+			s->minbuf[pos] = 0x80;
+			pos--;
+		}
+		s->minbuf[pos]++;
+	}
+
+	ASSERT_MEM_EQ(s->maxbuf, s->minbuf, s->encode_to);
+	PASS();
+}
+
+/* Round-trip every word from 0 to s->max: encode it with the length and
+ * flags from s, decode the bytes again, and require the decoded length,
+ * flags and word to match what was encoded.
+ */
+TEST encode_decode_byte_seq(struct seq *s) {
+	struct word w;
+	struct creole_reader r = {0};
+	unsigned char buf[7];
+	creole_word i;
+
+	for (i = 0; ; i++) {
+		ASSERT_EQ(creole_encode(i, s->encode_to, s->high_bits,
+					buf), 1);
+
+		/* Point the reader at the bytes that were just written. */
+		r.left = s->encode_to;
+		r.p = buf;
+
+		ASSERT_EQ(decode_seq(&r, &w), 1);
+		ASSERT_EQ(w.len, s->encode_to);
+		ASSERT_EQ(w.high_bits, s->high_bits);
+		ASSERT_EQ(w.word, i);
+
+		if (i == s->max)
+			break;
+	}
+
+	PASS();
+}
+
+/* Run the encoder checks for every sequence length. The one-byte form is
+ * checked once because it stores no flags; lengths 2 to 7 are checked
+ * for each flag value from 0 to 15.
+ *
+ * For each case minbuf is set to the expected encoding of 0 and maxbuf
+ * to the expected encoding of max. encode_byte_seq() modifies minbuf, so
+ * it is set up again before every run.
+ */
+SUITE(pseudo_utf8_encode_all) {
+	struct seq s;
+
+	RUN_TEST(no_values);
+
+	s.max = 0x7F;
+	s.encode_to = 1;
+	s.high_bits = 0;
+	s.minbuf[0] = 0x00;
+	s.maxbuf[0] = 0x7F;
+
+	RUN_TEST1(encode_byte_seq, &s);
+
+	for (s.high_bits = 0; s.high_bits < 16; s.high_bits++) {
+		/* Continuation bytes without flags end at 0xBF; the cases
+		 * below overwrite only the bytes that differ from that.
+		 */
+		memset(s.maxbuf, 0xBF, sizeof(s.maxbuf));
+
+		s.max = 0x7F;
+		s.encode_to = 2;
+		s.minbuf[0] = 0xC0 | (s.high_bits << 1);
+		s.maxbuf[0] = 0xC1 | (s.high_bits << 1);
+		s.minbuf[1] = 0x80;
+		RUN_TEST1(encode_byte_seq, &s);
+
+		s.max = 0xFFF;
+		s.encode_to = 3;
+		s.minbuf[0] = 0xE0 | s.high_bits;
+		s.maxbuf[0] = 0xE0 | s.high_bits;
+		s.minbuf[1] = 0x80;
+		s.minbuf[2] = 0x80;
+		RUN_TEST1(encode_byte_seq, &s);
+
+		/* From here on the low flag bits sit in the second byte.
+		 * The mask must be applied before the shift: << binds
+		 * tighter than &, so the parentheses are required.
+		 */
+		s.max = 0x1FFFF;
+		s.encode_to = 4;
+		s.maxbuf[0] = s.minbuf[0] = 0xF0 | (s.high_bits >> 1);
+		s.minbuf[1] = 0x80 | ((s.high_bits & 0x1) << 5);
+		s.maxbuf[1] = 0x9F | ((s.high_bits & 0x1) << 5);
+		s.minbuf[2] = 0x80;
+		s.minbuf[3] = 0x80;
+		RUN_TEST1(encode_byte_seq, &s);
+
+		s.max = 0x3FFFFF;
+		s.encode_to = 5;
+		s.maxbuf[0] = s.minbuf[0] = 0xF8 | (s.high_bits >> 2);
+		s.minbuf[1] = 0x80 | ((s.high_bits & 0x3) << 4);
+		s.maxbuf[1] = 0x8F | ((s.high_bits & 0x3) << 4);
+		s.minbuf[2] = 0x80;
+		s.minbuf[3] = 0x80;
+		s.minbuf[4] = 0x80;
+		RUN_TEST1(encode_byte_seq, &s);
+
+		s.max = 0x7FFFFFF;
+		s.encode_to = 6;
+		s.maxbuf[0] = s.minbuf[0] = 0xFC | (s.high_bits >> 3);
+		s.minbuf[1] = 0x80 | ((s.high_bits & 0x7) << 3);
+		s.maxbuf[1] = 0x87 | ((s.high_bits & 0x7) << 3);
+		s.minbuf[2] = 0x80;
+		s.minbuf[3] = 0x80;
+		s.minbuf[4] = 0x80;
+		s.minbuf[5] = 0x80;
+		RUN_TEST1(encode_byte_seq, &s);
+
+		s.max = 0xFFFFFFFF;
+		s.encode_to = 7;
+		s.maxbuf[0] = s.minbuf[0] = 0xFE;
+		s.minbuf[1] = 0x80 | (s.high_bits << 2);
+		s.maxbuf[1] = 0x83 | (s.high_bits << 2);
+		s.minbuf[2] = 0x80;
+		s.minbuf[3] = 0x80;
+		s.minbuf[4] = 0x80;
+		s.minbuf[5] = 0x80;
+		s.minbuf[6] = 0x80;
+		RUN_TEST1(encode_byte_seq, &s);
+	}
+}
+
int main(int argc, char *argv[]) {
GREATEST_MAIN_BEGIN();
RUN_SUITE(reader);
+ RUN_SUITE(pseudo_utf8_encode_all); /* encoder checks over every length and flag value */
GREATEST_MAIN_END();
}