This commit is contained in:
Peter McGoron 2023-02-07 04:38:39 +00:00
parent f63d6cdd3d
commit b053379db2
3 changed files with 263 additions and 24 deletions

View File

@ -1,2 +1,2 @@
test_creole: test_creole.c creole.c creole.h greatest.h
$(CC) test_creole.c -Wall -pedantic -std=c89 -o test_creole
$(CC) -g test_creole.c -Wall -pedantic -std=c89 -o test_creole

109
creole.c
View File

@ -57,8 +57,6 @@ static int read_eof(struct creole_reader *r)
return r->left == 0;
}
#if 0
/*************************************************************************
* Pseudo-UTF-8 lexing
*
@ -67,12 +65,12 @@ static int read_eof(struct creole_reader *r)
*
* Possible values:
* 0xxxxxxx (7 bits)
* 110xxxxx 10xxxxxx (11 bits)
* 1110xxxx 10xxxxxx 10xxxxxx (16 bits)
* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (21 bits)
* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (26 bits)
* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (31 bits)
* 11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (36 bits)
* 110HHHHx 10xxxxxx (11 bits)
* 1110HHHH 10xxxxxx 10xxxxxx (16 bits)
* 11110HHH 10Hxxxxx 10xxxxxx 10xxxxxx (21 bits)
* 111110HH 10HHxxxx 10xxxxxx 10xxxxxx 10xxxxxx (26 bits)
* 1111110H 10HHHxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (31 bits)
* 11111110 10HHHHxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (36 bits)
* 10xxxxxx
************************************************************************/
@ -84,6 +82,7 @@ static int read_eof(struct creole_reader *r)
* * A multi-byte sequence where the 4 MSB are flags, and the
* * lower bits are the encoded word.
*/
#define MAX_HIGH_BITS 15
struct word {
int len;
int high_bits;
@ -93,14 +92,14 @@ struct word {
/* Decode a set of continuation bytes directly into the word. This assumes
* that each continuation byte contains no high bits.
*/
static int read_continue(struct creole_reader *r, struct encoded_word *w,
static int read_continue(struct creole_reader *r, struct word *w,
int to_read)
{
int i;
int r_ret;
unsigned char c;
for (i = 0; i < to_read) {
for (i = 0; i < to_read; i++) {
r_ret = read(r);
if (r_ret < 0)
return 0;
@ -108,7 +107,7 @@ static int read_continue(struct creole_reader *r, struct encoded_word *w,
c = (unsigned char)(r_ret & 0xFF);
if (c >> 6 != 0x2)
return 0;
w->word = w->word << 6 | (c & 0x6);
w->word = w->word << 6 | (c & 0x3F);
}
return 1;
@ -130,6 +129,7 @@ static int parse_start_byte(unsigned char c, struct word *w)
* bits to shift.
*/
unsigned char mask;
/* The word bits, if they exist, always start from the
* LSB, so there is no need to shift the bits away. The
* word_mask gets the low bits. If there are no bits, set
@ -149,20 +149,18 @@ static int parse_start_byte(unsigned char c, struct word *w)
* high-bit flags in them.
*/
int to_read;
} start_data[START_BYTE_NUM] {
} start_data[START_BYTE_NUM-1] = {
{0xFE, 0x00, 0, 0x0, 5}, /* 11111110 */
{0xFC, 0x00, 0, 0x1, 4}, /* 1111110x */,
{0xF8, 0x00, 0, 0x3, 3}, /* 111110xx */,
{0xF0, 0x00, 0, 0x7, 2}, /* 11110xxx */,
{0xE0, 0x00, 0, 0xF, 2}, /* 1110xxxx */,
{0xC0, 0x01, 1, 0xF, 1}, /* 110xxxxx */,
/* The single byte sequence has no high bits. */
{0x00, 0x7F, 0, 0x0, 0} /* 0xxxxxxx */,
{0xFC, 0x00, 0, 0x1, 4}, /* 1111110x */
{0xF8, 0x00, 0, 0x3, 3}, /* 111110xx */
{0xF0, 0x00, 0, 0x7, 2}, /* 11110xxx */
{0xE0, 0x00, 0, 0xF, 2}, /* 1110xxxx */
{0xC0, 0x01, 1, 0xF, 1} /* 110xxxxx */
};
int i;
for (i = 0; i < START_BYTE_NUM; i++) {
for (i = 0; i < START_BYTE_NUM-1; i++) {
if (c >> i == start_data[i].mask >> i) {
w->len = START_BYTE_NUM - i;
w->word = c & start_data[i].word_mask;
@ -171,6 +169,13 @@ static int parse_start_byte(unsigned char c, struct word *w)
return start_data[i].to_read;
}
}
/* i == 7 */
if (c >> 7 == 0) {
w->len = 1;
w->word = c;
w->high_bits = 0;
return 0;
}
return -1;
}
@ -203,7 +208,6 @@ static void parse_special_byte(unsigned char c, struct word *w)
static int decode_seq(struct creole_reader *r, struct word *w)
{
int r_ret;
unsigned char c;
int to_read;
r_ret = read(r);
@ -224,9 +228,70 @@ static int decode_seq(struct creole_reader *r, struct word *w)
parse_special_byte((unsigned char)(r_ret & 0xFF), w);
}
return read_continue(r, decoded_word, to_read);
return read_continue(r, w, to_read);
}
/* Encode the word `i` as a pseudo-UTF-8 sequence of exactly `encode_to`
 * bytes, storing the 4 flag bits `high_bits` alongside it.
 *
 *   i          word to encode
 *   encode_to  length of the sequence to produce, 1 to 7 bytes
 *   high_bits  flag value, 0 to 15; ignored for 1-byte sequences, which
 *              have no room for flags
 *   buf        receives the first `encode_to` bytes of output
 *
 * Returns 1 on success. Returns 0, without producing a valid sequence,
 * when `encode_to` is outside 1..7, when `i` does not fit in a sequence
 * of that length, or when `high_bits` does not fit in 4 bits.
 */
int creole_encode(creole_word i, unsigned encode_to, unsigned high_bits,
                  unsigned char buf[7])
{
	/* One row per multi-byte length (2 to 7 bytes), indexed by
	 * encode_to - 2.
	 */
	static const struct {
		/* Largest word that fits in this length. */
		creole_word max;
		/* Length marker of the start byte. */
		unsigned char b1_mask;
		/* Right shift that keeps the flag bits stored in byte 0. */
		int high_bit_shift_b1;
		/* Left shift (despite the name) aligning those flag bits
		 * inside byte 0.
		 */
		int high_bit_shift_to_right_b1;
		/* Right shift of the word giving the data bits of byte 0. */
		int data_shift_b1;
		/* Mask of the flag bits stored in byte 1. */
		int high_bit_mask_b2;
		/* Left shift aligning those flag bits inside byte 1. */
		int high_bit_shift_b2;
		/* Mask of the word bits stored in byte 1. */
		unsigned char b2_data_mask;
	} d[] = {
		{0x7F,       0xC0, 0, 1, 6,  0x0, 0, 0x3F}, /* 2 */
		{0xFFF,      0xE0, 0, 0, 12, 0x0, 0, 0x3F}, /* 3 */
		{0x1FFFF,    0xF0, 1, 0, 17, 0x1, 5, 0x1F}, /* 4 */
		{0x3FFFFF,   0xF8, 2, 0, 22, 0x3, 4, 0x0F}, /* 5 */
		{0x7FFFFFF,  0xFC, 3, 0, 27, 0x7, 3, 0x07}, /* 6 */
		{0xFFFFFFFF, 0xFE, 4, 0, 32, 0xF, 2, 0x03}  /* 7 */
	};
	unsigned lb;
	unsigned j;

	/* Only lengths 1..7 exist. Rejecting 0 matters as much as rejecting
	 * values above 7: encode_to - 2 is unsigned and would wrap around to
	 * a huge table index.
	 */
	if (encode_to < 1 || encode_to > 7)
		return 0;

	if (encode_to == 1) {
		if (i < 0x80) {
			buf[0] = (unsigned char)i;
			return 1;
		}
		return 0;
	}

	/* Only 4 flag bits can be stored; anything larger would overwrite
	 * the length marker in the start byte.
	 */
	if (high_bits > 0xF)
		return 0;

	lb = encode_to - 2;
	if (i > d[lb].max)
		return 0;

	buf[0] = (unsigned char)(d[lb].b1_mask
	         | (high_bits >> d[lb].high_bit_shift_b1
	            << d[lb].high_bit_shift_to_right_b1));
	/* shifts greater than or equal to the bit size of a type are
	 * undefined. Data in the first byte is always aligned with the LSB.
	 */
	if ((unsigned)d[lb].data_shift_b1 < sizeof(i) * CHAR_BIT)
		buf[0] |= (unsigned char)(i >> d[lb].data_shift_b1);

	/* Byte 1 holds the remaining flag bits above the next word bits. */
	buf[1] = (unsigned char)(0x80
	         | ((high_bits & d[lb].high_bit_mask_b2)
	            << d[lb].high_bit_shift_b2)
	         | ((i >> ((encode_to - 2) * 6))
	            & d[lb].b2_data_mask));

	/* Every later byte is a plain continuation byte with 6 word bits. */
	for (j = 2; j < encode_to; j++) {
		buf[j] = (unsigned char)(0x80
		         | ((i >> ((encode_to - j - 1) * 6)) & 0x3F));
	}
	return 1;
}
#if 0
/*************************************************************************
* Parsing instructions
*

View File

@ -3,7 +3,7 @@ GREATEST_MAIN_DEFS();
#include "creole.c"
/**************************************************************************
* Reader suite
* Reader
*************************************************************************/
#define reader_lit(r, s) do { \
r.p = (unsigned char *)s; \
@ -37,8 +37,182 @@ SUITE(reader) {
RUN_TEST1(reader_test_basic, &r);
}
/**************************************************************************
* Pseudo UTF-8 sequences
*************************************************************************/
/* decode_seq() must return 0 when the reader has no bytes to offer. */
TEST no_values(void) {
	struct creole_reader r;
	struct word w;
	/* An empty reader: no buffer and zero bytes remaining. */
	r.p = NULL;
	r.left = 0;
	ASSERT_EQ(decode_seq(&r, &w), 0);
	PASS();
}
/* Parameters and expected bytes for one exhaustive encoding run. */
struct seq {
	/* Largest word to encode; the tests count from 0 up to this value. */
	creole_word max;
	/* Sequence length in bytes, passed to creole_encode(). */
	unsigned encode_to;
	/* Flag value passed to creole_encode(). */
	unsigned high_bits;
	/* Expected encoding of the current word. Starts as the encoding of 0
	 * and is advanced in place by encode_byte_seq().
	 */
	unsigned char minbuf[7];
	/* Expected encoding of `max`, compared against minbuf after the run. */
	unsigned char maxbuf[7];
};
/* Print the eight bits of `c` to stdout as '0'/'1' digits, most
 * significant bit first. No separator or newline is written.
 */
void bprint(unsigned char c) {
	unsigned mask;

	/* Walk a single set bit from bit 7 down to bit 0. */
	for (mask = 0x80; mask != 0; mask >>= 1)
		printf("%u", (c & mask) ? 1u : 0u);
}
/* Print the first `len` bytes of `b` in binary via bprint(), each byte
 * followed by a single space. Prints nothing when `len` is not positive.
 */
void bprintb(unsigned char *b, int len) {
	int k;

	for (k = 0; k < len; k++) {
		bprint(b[k]);
		printf(" ");
	}
}
/* Encode every word from 0 to s->max with creole_encode() and compare each
 * result against an independently maintained expected byte string.
 *
 * s->minbuf must hold the expected encoding of 0 on entry. It is advanced
 * in place after every word and must equal s->maxbuf once s->max has been
 * reached.
 */
TEST encode_byte_seq(struct seq *s) {
	unsigned char buf[7];
	creole_word i;
	int pos;

	for (i = 0; ; i++) {
		/* Debugging aid, left disabled:
		 *   printf("0x%X ", i);
		 *   bprintb(s->minbuf, s->encode_to);
		 *   printf("\n");
		 */
		ASSERT_EQ(creole_encode(i, s->encode_to, s->high_bits, buf), 1);
		ASSERT_MEM_EQ(s->minbuf, buf, s->encode_to);

		/* Stop before stepping past the largest word. */
		if (i == s->max)
			break;

		/* Advance the expected bytes by one. Bytes after the first
		 * count from 0x80 to 0xBF; a byte already at 0xBF resets to
		 * 0x80 and carries into the byte before it. A carry out of
		 * byte 1, or a 1-byte sequence, increments the start byte.
		 */
		pos = s->encode_to - 1;
		while (pos > 0 && s->minbuf[pos] == 0xBF) {
			s->minbuf[pos] = 0x80;
			pos--;
		}
		s->minbuf[pos]++;
	}

	ASSERT_MEM_EQ(s->maxbuf, s->minbuf, s->encode_to);
	PASS();
}
/* Round-trip check: encode every word from 0 to s->max with
 * creole_encode(), decode the bytes again with decode_seq(), and require
 * the decoded length, flag bits and word to match what was encoded.
 *
 * NOTE(review): no RUN_TEST1 call for this test is visible in the
 * pseudo_utf8_encode_all suite; confirm whether it is meant to be run.
 */
TEST encode_decode_byte_seq(struct seq *s) {
	struct creole_reader r = {0};
	struct word w;
	unsigned char buf[7];
	creole_word i;

	for (i = 0; ; i++) {
		ASSERT_EQ(creole_encode(i, s->encode_to, s->high_bits, buf), 1);

		/* Point the reader at the freshly encoded bytes. */
		r.left = s->encode_to;
		r.p = buf;

		ASSERT_EQ(decode_seq(&r, &w), 1);
		ASSERT_EQ(w.len, s->encode_to);
		ASSERT_EQ(w.high_bits, s->high_bits);
		ASSERT_EQ(w.word, i);

		/* Stop before stepping past the largest word. */
		if (i == s->max)
			break;
	}
	PASS();
}
/* Runs no_values, then encode_byte_seq for the 1-byte form and, for every
 * flag value 0..15, for each multi-byte length from 2 to 7 bytes.
 *
 * For each run, minbuf is set to the expected encoding of word 0 and
 * maxbuf to the expected encoding of the largest word for that length.
 */
SUITE(pseudo_utf8_encode_all) {
	struct seq s;

	RUN_TEST(no_values);

	/* 1 byte: 0xxxxxxx, no flag bits. */
	s.max = 0x7F;
	s.encode_to = 1;
	s.high_bits = 0;
	s.minbuf[0] = 0x00;
	s.maxbuf[0] = 0x7F;
	RUN_TEST1(encode_byte_seq, &s);

	for (s.high_bits = 0; s.high_bits < 16; s.high_bits++) {
		/* Plain continuation bytes of the largest word are all 0xBF;
		 * only the bytes that carry flag bits are overridden below.
		 */
		memset(s.maxbuf, 0xBF, sizeof(s.maxbuf));

		/* 2 bytes: 110HHHHx 10xxxxxx */
		s.max = 0x7F;
		s.encode_to = 2;
		s.minbuf[0] = 0xC0 | (s.high_bits << 1);
		s.maxbuf[0] = 0xC1 | (s.high_bits << 1);
		s.minbuf[1] = 0x80;
		RUN_TEST1(encode_byte_seq, &s);

		/* 3 bytes: 1110HHHH 10xxxxxx 10xxxxxx */
		s.max = 0xFFF;
		s.encode_to = 3;
		s.minbuf[0] = 0xE0 | s.high_bits;
		s.maxbuf[0] = 0xE0 | s.high_bits;
		s.minbuf[1] = 0x80;
		s.minbuf[2] = 0x80;
		RUN_TEST1(encode_byte_seq, &s);

		/* From 4 bytes on, the flag bits are split between byte 0 and
		 * byte 1. The low flag bits must be masked BEFORE shifting:
		 * `a & m << n` parses as `a & (m << n)`, which is always 0
		 * (or wrong) for a 4-bit flag value.
		 */

		/* 4 bytes: 11110HHH 10Hxxxxx 10xxxxxx 10xxxxxx */
		s.max = 0x1FFFF;
		s.encode_to = 4;
		s.maxbuf[0] = s.minbuf[0] = 0xF0 | (s.high_bits >> 1);
		s.minbuf[1] = 0x80 | ((s.high_bits & 0x1) << 5);
		s.maxbuf[1] = 0x9F | ((s.high_bits & 0x1) << 5);
		s.minbuf[2] = 0x80;
		s.minbuf[3] = 0x80;
		RUN_TEST1(encode_byte_seq, &s);

		/* 5 bytes: 111110HH 10HHxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
		s.max = 0x3FFFFF;
		s.encode_to = 5;
		s.maxbuf[0] = s.minbuf[0] = 0xF8 | (s.high_bits >> 2);
		s.minbuf[1] = 0x80 | ((s.high_bits & 0x3) << 4);
		s.maxbuf[1] = 0x8F | ((s.high_bits & 0x3) << 4);
		s.minbuf[2] = 0x80;
		s.minbuf[3] = 0x80;
		s.minbuf[4] = 0x80;
		RUN_TEST1(encode_byte_seq, &s);

		/* 6 bytes: 1111110H 10HHHxxx followed by four 10xxxxxx */
		s.max = 0x7FFFFFF;
		s.encode_to = 6;
		s.maxbuf[0] = s.minbuf[0] = 0xFC | (s.high_bits >> 3);
		s.minbuf[1] = 0x80 | ((s.high_bits & 0x7) << 3);
		s.maxbuf[1] = 0x87 | ((s.high_bits & 0x7) << 3);
		s.minbuf[2] = 0x80;
		s.minbuf[3] = 0x80;
		s.minbuf[4] = 0x80;
		s.minbuf[5] = 0x80;
		RUN_TEST1(encode_byte_seq, &s);

		/* 7 bytes: 11111110 10HHHHxx followed by five 10xxxxxx */
		s.max = 0xFFFFFFFF;
		s.encode_to = 7;
		s.maxbuf[0] = s.minbuf[0] = 0xFE;
		s.minbuf[1] = 0x80 | (s.high_bits << 2);
		s.maxbuf[1] = 0x83 | (s.high_bits << 2);
		s.minbuf[2] = 0x80;
		s.minbuf[3] = 0x80;
		s.minbuf[4] = 0x80;
		s.minbuf[5] = 0x80;
		s.minbuf[6] = 0x80;
		RUN_TEST1(encode_byte_seq, &s);
	}
}
/* Test program entry point: runs the reader suite, then the pseudo-UTF-8
 * encoding suite, between the greatest begin/end macros.
 */
int main(int argc, char *argv[]) {
	GREATEST_MAIN_BEGIN();
	RUN_SUITE(reader);
	RUN_SUITE(pseudo_utf8_encode_all);
	GREATEST_MAIN_END();
}