From b053379db2cc84a25852126a2b06e9abbe184937 Mon Sep 17 00:00:00 2001 From: Peter McGoron Date: Tue, 7 Feb 2023 04:38:39 +0000 Subject: [PATCH] encode --- Makefile | 2 +- creole.c | 109 ++++++++++++++++++++++++------- test_creole.c | 176 +++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 263 insertions(+), 24 deletions(-) diff --git a/Makefile b/Makefile index aaa56f5..2b34dfb 100644 --- a/Makefile +++ b/Makefile @@ -1,2 +1,2 @@ test_creole: test_creole.c creole.c creole.h greatest.h - $(CC) test_creole.c -Wall -pedantic -std=c89 -o test_creole + $(CC) -g test_creole.c -Wall -pedantic -std=c89 -o test_creole diff --git a/creole.c b/creole.c index 4259e42..58b6c83 100644 --- a/creole.c +++ b/creole.c @@ -57,8 +57,6 @@ static int read_eof(struct creole_reader *r) return r->left == 0; } -#if 0 - /************************************************************************* * Pseudo-UTF-8 lexing * @@ -67,12 +65,12 @@ static int read_eof(struct creole_reader *r) * * Possible values: * 0xxxxxxx (7 bits) - * 110xxxxx 10xxxxxx (11 bits) - * 1110xxxx 10xxxxxx 10xxxxxx (16 bits) - * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (21 bits) - * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (26 bits) - * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (31 bits) - * 11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (36 bits) + * 110HHHHx 10xxxxxx (11 bits) + * 1110HHHH 10xxxxxx 10xxxxxx (16 bits) + * 11110HHH 10Hxxxxx 10xxxxxx 10xxxxxx (21 bits) + * 111110HH 10HHxxxx 10xxxxxx 10xxxxxx 10xxxxxx (26 bits) + * 1111110H 10HHHxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (31 bits) + * 11111110 10HHHHxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (36 bits) * 10xxxxxx ************************************************************************/ @@ -84,6 +82,7 @@ static int read_eof(struct creole_reader *r) * * A multi-byte sequence where the 4 MSB are flags, and the * * lower bits are the encoded word. 
*/ +#define MAX_HIGH_BITS 15 struct word { int len; int high_bits; @@ -93,14 +92,14 @@ struct word { /* Decode a set of continuation bytes directly into the word. This assumes * that each continuation byte contains no high words. */ -static int read_continue(struct creole_reader *r, struct encoded_word *w, +static int read_continue(struct creole_reader *r, struct word *w, int to_read) { int i; int r_ret; unsigned char c; - for (i = 0; i < to_read) { + for (i = 0; i < to_read; i++) { r_ret = read(r); if (r_ret < 0) return 0; @@ -108,7 +107,7 @@ static int read_continue(struct creole_reader *r, struct encoded_word *w, c = (unsigned char)(r_ret & 0xFF); if (c >> 6 != 0x2) return 0; - w->word = w->word << 6 | (c & 0x6); + w->word = w->word << 6 | (c & 0x3F); } return 1; @@ -130,6 +129,7 @@ static int parse_start_byte(unsigned char c, struct word *w) * bits to shift. */ unsigned char mask; + /* The word bits, if they exist, always start from the * LSB, so there is no need to shift the bits away. The * word_mask gets the low bits. If there are no bits, set @@ -149,20 +149,18 @@ static int parse_start_byte(unsigned char c, struct word *w) * high-bit flags in them. */ int to_read; - } start_data[START_BYTE_NUM] { + } start_data[START_BYTE_NUM-1] = { {0xFE, 0x00, 0, 0x0, 5}, /* 11111110 */ - {0xFC, 0x00, 0, 0x1, 4}, /* 1111110x */, - {0xF8, 0x00, 0, 0x3, 3}, /* 111110xx */, - {0xF0, 0x00, 0, 0x7, 2}, /* 11110xxx */, - {0xE0, 0x00, 0, 0xF, 2}, /* 1110xxxx */, - {0xC0, 0x01, 1, 0xF, 1}, /* 110xxxxx */, - /* The single byte sequence has no high bits. 
*/ - {0x00, 0x7F, 0, 0x0, 0} /* 0xxxxxxx */, + {0xFC, 0x00, 0, 0x1, 4}, /* 1111110x */ + {0xF8, 0x00, 0, 0x3, 3}, /* 111110xx */ + {0xF0, 0x00, 0, 0x7, 2}, /* 11110xxx */ + {0xE0, 0x00, 0, 0xF, 2}, /* 1110xxxx */ + {0xC0, 0x01, 1, 0xF, 1} /* 110xxxxx */ }; int i; - for (i = 0; i < START_BYTE_NUM; i++) { + for (i = 0; i < START_BYTE_NUM-1; i++) { if (c >> i == start_data[i].mask >> i) { w->len = START_BYTE_NUM - i; w->word = c & start_data[i].word_mask; @@ -171,6 +169,13 @@ static int parse_start_byte(unsigned char c, struct word *w) return start_data[i].to_read; } } + /* i == 7 */ + if (c >> 7 == 0) { + w->len = 1; + w->word = c; + w->high_bits = 0; + return 0; + } return -1; } @@ -203,7 +208,6 @@ static void parse_special_byte(unsigned char c, struct word *w) static int decode_seq(struct creole_reader *r, struct word *w) { int r_ret; - unsigned char c; int to_read; r_ret = read(r); @@ -224,9 +228,70 @@ static int decode_seq(struct creole_reader *r, struct word *w) parse_special_byte((unsigned char)(r_ret & 0xFF), w); } - return read_continue(r, decoded_word, to_read); + return read_continue(r, w, to_read); } +int creole_encode(creole_word i, unsigned encode_to, unsigned high_bits, + unsigned char buf[7]) +{ + static const struct { + creole_word max; + unsigned char b1_mask; + int high_bit_shift_b1; + int high_bit_shift_to_right_b1; + int data_shift_b1; + + int high_bit_mask_b2; + int high_bit_shift_b2; + unsigned char b2_data_mask; + } d[] = { + {0x7F, 0xC0, 0, 1, 6, 0x0, 0, 0x3F}, /* 2 */ + {0xFFF, 0xE0, 0, 0, 12, 0x0, 0, 0x3F}, /* 3 */ + {0x1FFFF, 0xF0, 1, 0, 17, 0x1, 5, 0x1F}, /* 4 */ + {0x3FFFFF, 0xF8, 2, 0, 22, 0x3, 4, 0x0F}, /* 5 */ + {0x7FFFFFF, 0xFC, 3, 0, 27, 0x7, 3, 0x07}, /* 6 */ + {0xFFFFFFFF, 0xFE, 4, 0, 32, 0xF, 2, 0x03} /* 7 */ + }; + unsigned lb; + unsigned j; + + if (encode_to == 0 || encode_to > 7) /* valid lengths are 1..7: d[] only describes 2..7 and buf holds 7 bytes; 0 would wrap lb below */ + return 0; + + if (encode_to == 1) { + if (i < 0x80) { + buf[0] = i; + return 1; + } + return 0; + } + + lb = encode_to - 2; + if (i > d[lb].max) + return 0; + + 
buf[0] = (d[lb].b1_mask | (high_bits >> d[lb].high_bit_shift_b1 + << d[lb].high_bit_shift_to_right_b1)); + /* shifts greater than or equal to the bit size of a type are + * undefined. Data in the first byte is always aligned with the LSB. + */ + if (d[lb].data_shift_b1 < sizeof(i) * CHAR_BIT) + buf[0] |= i >> d[lb].data_shift_b1; + + buf[1] = 0x80 | ((high_bits & d[lb].high_bit_mask_b2) + << d[lb].high_bit_shift_b2) + | ((i >> ((encode_to - 2) * 6)) + & d[lb].b2_data_mask); + + for (j = 2; j < encode_to; j++) { + buf[j] = 0x80 | ((i >> ((encode_to - j - 1) * 6)) & 0x3F); + } + + return 1; +} + +#if 0 + /************************************************************************* * Parsing instructions * diff --git a/test_creole.c b/test_creole.c index b8dc008..22a18b1 100644 --- a/test_creole.c +++ b/test_creole.c @@ -3,7 +3,7 @@ GREATEST_MAIN_DEFS(); #include "creole.c" /************************************************************************** - * Reader suite + * Reader *************************************************************************/ #define reader_lit(r, s) do { \ r.p = (unsigned char *)s; \ @@ -37,8 +37,182 @@ SUITE(reader) { RUN_TEST1(reader_test_basic, &r); } +/************************************************************************** + * Pseudo UTF-8 sequences + *************************************************************************/ + +TEST no_values(void) { + struct creole_reader r; + struct word w; + r.p = NULL; + r.left = 0; + ASSERT_EQ(decode_seq(&r, &w), 0); + + PASS(); +} + +struct seq { + creole_word max; + unsigned encode_to; + unsigned high_bits; + + unsigned char minbuf[7]; + unsigned char maxbuf[7]; +}; + +void bprint(unsigned char c) { + int i; + + for (i = 0; i < 8; i++) { + printf("%u", (c >> (7 - i)) & 1); + } +} + +void bprintb(unsigned char *b, int len) { + while (len-- > 0) { + bprint(*b++); + printf(" "); + } +} + +TEST encode_byte_seq(struct seq *s) { + creole_word i = 0; + int j; + unsigned char buf[7]; + + for (;;) { + /* + 
printf("0x%X ", i); + bprintb(s->minbuf, s->encode_to); + printf("\n"); + */ + + ASSERT_EQ(creole_encode(i, s->encode_to, s->high_bits, + buf), 1); + ASSERT_MEM_EQ(s->minbuf, buf, s->encode_to); + + if (i == s->max) + break; + i++; + + for (j = s->encode_to - 1; j > 0; j--) { + if (s->minbuf[j] == 0xBF) { + s->minbuf[j] = 0x80; + } else { + s->minbuf[j]++; + break; + } + } + + if (j == 0) + s->minbuf[0]++; + } + ASSERT_MEM_EQ(s->maxbuf, s->minbuf, s->encode_to); + PASS(); +} + +TEST encode_decode_byte_seq(struct seq *s) { + unsigned char buf[7]; + struct creole_reader r = {0}; + struct word w; + creole_word i = 0; + + for (;;) { + ASSERT_EQ(creole_encode(i, s->encode_to, s->high_bits, + buf), 1); + r.p = buf; + r.left = s->encode_to; + ASSERT_EQ(decode_seq(&r, &w), 1); + ASSERT_EQ(w.len, s->encode_to); + ASSERT_EQ(w.high_bits, s->high_bits); + ASSERT_EQ(w.word, i); + + if (i == s->max) + break; + i++; + } + + PASS(); +} + +SUITE(pseudo_utf8_encode_all) { + struct seq s; + + RUN_TEST(no_values); + + s.max = 0x7F; + s.encode_to = 1; + s.high_bits = 0; + s.minbuf[0] = 0x00; + s.maxbuf[0] = 0x7F; + + RUN_TEST1(encode_byte_seq, &s); + + for (s.high_bits = 0; s.high_bits < 16; s.high_bits++) { + memset(s.maxbuf, 0xBF, sizeof(s.maxbuf)); + + s.max = 0x7F; + s.encode_to = 2; + s.maxbuf[0] = s.minbuf[0] = 0xC0 | (s.high_bits << 1); + s.maxbuf[0] = 0xC1 | (s.high_bits << 1); + s.minbuf[1] = 0x80; + RUN_TEST1(encode_byte_seq, &s); + + s.max = 0xFFF; + s.encode_to = 3; + s.minbuf[0] = 0xE0 | s.high_bits; + s.maxbuf[0] = 0xE0 | s.high_bits; + s.minbuf[1] = 0x80; + s.minbuf[2] = 0x80; + RUN_TEST1(encode_byte_seq, &s); + + s.max = 0x1FFFF; + s.encode_to = 4; + s.maxbuf[0] = s.minbuf[0] = 0xF0 | (s.high_bits >> 1); + s.minbuf[1] = 0x80 | ((s.high_bits & 0x1) << 5); /* mask first, then shift: '<<' binds tighter than '&' */ + s.maxbuf[1] = 0x9F | ((s.high_bits & 0x1) << 5); + s.minbuf[2] = 0x80; + s.minbuf[3] = 0x80; + RUN_TEST1(encode_byte_seq, &s); + + s.max = 0x3FFFFF; + s.encode_to = 5; + s.maxbuf[0] = s.minbuf[0] = 0xF8 | (s.high_bits >> 
2); + s.minbuf[1] = 0x80 | ((s.high_bits & 0x3) << 4); /* low two high bits live in bits 5..4 of byte 2 */ + s.maxbuf[1] = 0x8F | ((s.high_bits & 0x3) << 4); + s.minbuf[2] = 0x80; + s.minbuf[3] = 0x80; + s.minbuf[4] = 0x80; + RUN_TEST1(encode_byte_seq, &s); + + s.max = 0x7FFFFFF; + s.encode_to = 6; + s.maxbuf[0] = s.minbuf[0] = 0xFC | (s.high_bits >> 3); + s.minbuf[1] = 0x80 | ((s.high_bits & 0x7) << 3); /* low three high bits live in bits 5..3 of byte 2 */ + s.maxbuf[1] = 0x87 | ((s.high_bits & 0x7) << 3); + s.minbuf[2] = 0x80; + s.minbuf[3] = 0x80; + s.minbuf[4] = 0x80; + s.minbuf[5] = 0x80; + RUN_TEST1(encode_byte_seq, &s); + + s.max = 0xFFFFFFFF; + s.encode_to = 7; + s.maxbuf[0] = s.minbuf[0] = 0xFE; + s.minbuf[1] = 0x80 | (s.high_bits << 2); + s.maxbuf[1] = 0x83 | (s.high_bits << 2); + s.minbuf[2] = 0x80; + s.minbuf[3] = 0x80; + s.minbuf[4] = 0x80; + s.minbuf[5] = 0x80; + s.minbuf[6] = 0x80; + RUN_TEST1(encode_byte_seq, &s); + } +} + int main(int argc, char *argv[]) { GREATEST_MAIN_BEGIN(); RUN_SUITE(reader); + RUN_SUITE(pseudo_utf8_encode_all); GREATEST_MAIN_END(); }