encode
This commit is contained in:
parent
f63d6cdd3d
commit
b053379db2
2
Makefile
2
Makefile
|
@ -1,2 +1,2 @@
|
|||
test_creole: test_creole.c creole.c creole.h greatest.h
|
||||
$(CC) test_creole.c -Wall -pedantic -std=c89 -o test_creole
|
||||
$(CC) -g test_creole.c -Wall -pedantic -std=c89 -o test_creole
|
||||
|
|
109
creole.c
109
creole.c
|
@ -57,8 +57,6 @@ static int read_eof(struct creole_reader *r)
|
|||
return r->left == 0;
|
||||
}
|
||||
|
||||
#if 0
|
||||
|
||||
/*************************************************************************
|
||||
* Pseudo-UTF-8 lexing
|
||||
*
|
||||
|
@ -67,12 +65,12 @@ static int read_eof(struct creole_reader *r)
|
|||
*
|
||||
* Possible values:
|
||||
* 0xxxxxxx (7 bits)
|
||||
* 110xxxxx 10xxxxxx (11 bits)
|
||||
* 1110xxxx 10xxxxxx 10xxxxxx (16 bits)
|
||||
* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (21 bits)
|
||||
* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (26 bits)
|
||||
* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (31 bits)
|
||||
* 11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (36 bits)
|
||||
* 110HHHHx 10xxxxxx (11 bits)
|
||||
* 1110HHHH 10xxxxxx 10xxxxxx (16 bits)
|
||||
* 11110HHH 10Hxxxxx 10xxxxxx 10xxxxxx (21 bits)
|
||||
* 111110HH 10HHxxxx 10xxxxxx 10xxxxxx 10xxxxxx (26 bits)
|
||||
* 1111110H 10HHHxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (31 bits)
|
||||
* 11111110 10HHHHxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx (36 bits)
|
||||
* 10xxxxxx
|
||||
************************************************************************/
|
||||
|
||||
|
@ -84,6 +82,7 @@ static int read_eof(struct creole_reader *r)
|
|||
* * A multi-byte sequence where the 4 MSB are flags, and the
|
||||
* * lower bits are the encoded word.
|
||||
*/
|
||||
#define MAX_HIGH_BITS 15
|
||||
struct word {
|
||||
int len;
|
||||
int high_bits;
|
||||
|
@ -93,14 +92,14 @@ struct word {
|
|||
/* Decode a set of continuation bytes directly into the word. This assumes
|
||||
* that each continuation byte contains no high bits.
|
||||
*/
|
||||
static int read_continue(struct creole_reader *r, struct encoded_word *w,
|
||||
static int read_continue(struct creole_reader *r, struct word *w,
|
||||
int to_read)
|
||||
{
|
||||
int i;
|
||||
int r_ret;
|
||||
unsigned char c;
|
||||
|
||||
for (i = 0; i < to_read) {
|
||||
for (i = 0; i < to_read; i++) {
|
||||
r_ret = read(r);
|
||||
if (r_ret < 0)
|
||||
return 0;
|
||||
|
@ -108,7 +107,7 @@ static int read_continue(struct creole_reader *r, struct encoded_word *w,
|
|||
c = (unsigned char)(r_ret & 0xFF);
|
||||
if (c >> 6 != 0x2)
|
||||
return 0;
|
||||
w->word = w->word << 6 | (c & 0x6);
|
||||
w->word = w->word << 6 | (c & 0x3F);
|
||||
}
|
||||
|
||||
return 1;
|
||||
|
@ -130,6 +129,7 @@ static int parse_start_byte(unsigned char c, struct word *w)
|
|||
* bits to shift.
|
||||
*/
|
||||
unsigned char mask;
|
||||
|
||||
/* The word bits, if they exist, always start from the
|
||||
* LSB, so there is no need to shift the bits away. The
|
||||
* word_mask gets the low bits. If there are no bits, set
|
||||
|
@ -149,20 +149,18 @@ static int parse_start_byte(unsigned char c, struct word *w)
|
|||
* high-bit flags in them.
|
||||
*/
|
||||
int to_read;
|
||||
} start_data[START_BYTE_NUM] {
|
||||
} start_data[START_BYTE_NUM-1] = {
|
||||
{0xFE, 0x00, 0, 0x0, 5}, /* 11111110 */
|
||||
{0xFC, 0x00, 0, 0x1, 4}, /* 1111110x */,
|
||||
{0xF8, 0x00, 0, 0x3, 3}, /* 111110xx */,
|
||||
{0xF0, 0x00, 0, 0x7, 2}, /* 11110xxx */,
|
||||
{0xE0, 0x00, 0, 0xF, 2}, /* 1110xxxx */,
|
||||
{0xC0, 0x01, 1, 0xF, 1}, /* 110xxxxx */,
|
||||
/* The single byte sequence has no high bits. */
|
||||
{0x00, 0x7F, 0, 0x0, 0} /* 0xxxxxxx */,
|
||||
{0xFC, 0x00, 0, 0x1, 4}, /* 1111110x */
|
||||
{0xF8, 0x00, 0, 0x3, 3}, /* 111110xx */
|
||||
{0xF0, 0x00, 0, 0x7, 2}, /* 11110xxx */
|
||||
{0xE0, 0x00, 0, 0xF, 2}, /* 1110xxxx */
|
||||
{0xC0, 0x01, 1, 0xF, 1} /* 110xxxxx */
|
||||
};
|
||||
|
||||
int i;
|
||||
|
||||
for (i = 0; i < START_BYTE_NUM; i++) {
|
||||
for (i = 0; i < START_BYTE_NUM-1; i++) {
|
||||
if (c >> i == start_data[i].mask >> i) {
|
||||
w->len = START_BYTE_NUM - i;
|
||||
w->word = c & start_data[i].word_mask;
|
||||
|
@ -171,6 +169,13 @@ static int parse_start_byte(unsigned char c, struct word *w)
|
|||
return start_data[i].to_read;
|
||||
}
|
||||
}
|
||||
/* i == 7 */
|
||||
if (c >> 7 == 0) {
|
||||
w->len = 1;
|
||||
w->word = c;
|
||||
w->high_bits = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
@ -203,7 +208,6 @@ static void parse_special_byte(unsigned char c, struct word *w)
|
|||
static int decode_seq(struct creole_reader *r, struct word *w)
|
||||
{
|
||||
int r_ret;
|
||||
unsigned char c;
|
||||
int to_read;
|
||||
|
||||
r_ret = read(r);
|
||||
|
@ -224,9 +228,70 @@ static int decode_seq(struct creole_reader *r, struct word *w)
|
|||
parse_special_byte((unsigned char)(r_ret & 0xFF), w);
|
||||
}
|
||||
|
||||
return read_continue(r, decoded_word, to_read);
|
||||
return read_continue(r, w, to_read);
|
||||
}
|
||||
|
||||
/* Encode a word, together with its high-bit flags, as a pseudo-UTF-8
 * sequence of exactly encode_to bytes.
 *
 * i:         value to encode.
 * encode_to: number of bytes to emit, 1 through 7.
 * high_bits: flag bits stored next to the data. A multi-byte sequence has
 *            room for four of them; a one-byte sequence has room for
 *            none, and this argument is ignored for it.
 * buf:       receives the first encode_to bytes of output.
 *
 * Returns 1 on success. Returns 0 without writing to buf when encode_to
 * is outside 1..7, when i does not fit in a sequence of that length, or
 * (multi-byte only) when high_bits does not fit in four bits.
 */
int creole_encode(creole_word i, unsigned encode_to, unsigned high_bits,
                  unsigned char buf[7])
{
	/* Per-length layout, indexed by (encode_to - 2). */
	static const struct {
		/* Largest value that fits in this length. */
		creole_word max;
		/* Length marker of the start byte. */
		unsigned char b1_mask;
		/* The start byte stores (high_bits >> rshift) << lshift. */
		unsigned hb_rshift_b1;
		unsigned hb_lshift_b1;
		/* Right shift that leaves the start byte's data bits at the LSB. */
		unsigned data_shift_b1;

		/* The second byte stores (high_bits & mask) << shift ... */
		unsigned hb_mask_b2;
		unsigned hb_shift_b2;
		/* ... plus the data bits selected by this mask. */
		unsigned char b2_data_mask;
	} d[] = {
		{0x7F,       0xC0, 0, 1, 6,  0x0, 0, 0x3F}, /* 2 */
		{0xFFF,      0xE0, 0, 0, 12, 0x0, 0, 0x3F}, /* 3 */
		{0x1FFFF,    0xF0, 1, 0, 17, 0x1, 5, 0x1F}, /* 4 */
		{0x3FFFFF,   0xF8, 2, 0, 22, 0x3, 4, 0x0F}, /* 5 */
		{0x7FFFFFF,  0xFC, 3, 0, 27, 0x7, 3, 0x07}, /* 6 */
		{0xFFFFFFFF, 0xFE, 4, 0, 32, 0xF, 2, 0x03}  /* 7 */
	};
	unsigned lb;
	unsigned j;

	/* d[] only describes lengths 2..7 and buf holds 7 bytes; a length
	 * of 0 would also wrap the unsigned table index below.
	 */
	if (encode_to < 1 || encode_to > 7)
		return 0;

	if (encode_to == 1) {
		if (i >= 0x80)
			return 0;
		buf[0] = (unsigned char)i;
		return 1;
	}

	/* Wider flags would spill into the length marker of the start byte. */
	if (high_bits > 0xF)
		return 0;

	lb = encode_to - 2;
	if (i > d[lb].max)
		return 0;

	buf[0] = (unsigned char)(d[lb].b1_mask
	         | (high_bits >> d[lb].hb_rshift_b1 << d[lb].hb_lshift_b1));
	/* Shifts greater than or equal to the bit size of a type are
	 * undefined. Data in the first byte is always aligned with the LSB.
	 */
	if (d[lb].data_shift_b1 < sizeof(i) * CHAR_BIT)
		buf[0] |= (unsigned char)(i >> d[lb].data_shift_b1);

	buf[1] = (unsigned char)(0x80
	         | ((high_bits & d[lb].hb_mask_b2) << d[lb].hb_shift_b2)
	         | ((i >> (lb * 6)) & d[lb].b2_data_mask));

	/* Every remaining byte is a plain continuation carrying six bits. */
	for (j = 2; j < encode_to; j++) {
		buf[j] = (unsigned char)(0x80
		         | ((i >> ((encode_to - j - 1) * 6)) & 0x3F));
	}

	return 1;
}
|
||||
|
||||
#if 0
|
||||
|
||||
/*************************************************************************
|
||||
* Parsing instructions
|
||||
*
|
||||
|
|
176
test_creole.c
176
test_creole.c
|
@ -3,7 +3,7 @@ GREATEST_MAIN_DEFS();
|
|||
#include "creole.c"
|
||||
|
||||
/**************************************************************************
|
||||
* Reader suite
|
||||
* Reader
|
||||
*************************************************************************/
|
||||
#define reader_lit(r, s) do { \
|
||||
r.p = (unsigned char *)s; \
|
||||
|
@ -37,8 +37,182 @@ SUITE(reader) {
|
|||
RUN_TEST1(reader_test_basic, &r);
|
||||
}
|
||||
|
||||
/**************************************************************************
|
||||
* Pseudo UTF-8 sequences
|
||||
*************************************************************************/
|
||||
|
||||
TEST no_values(void) {
|
||||
struct creole_reader r;
|
||||
struct word w;
|
||||
r.p = NULL;
|
||||
r.left = 0;
|
||||
ASSERT_EQ(decode_seq(&r, &w), 0);
|
||||
|
||||
PASS();
|
||||
}
|
||||
|
||||
struct seq {
|
||||
creole_word max;
|
||||
unsigned encode_to;
|
||||
unsigned high_bits;
|
||||
|
||||
unsigned char minbuf[7];
|
||||
unsigned char maxbuf[7];
|
||||
};
|
||||
|
||||
/* Print the eight bits of c to stdout, most significant bit first. */
void bprint(unsigned char c) {
	unsigned mask;

	/* Walk a single-bit mask from bit 7 down to bit 0. */
	for (mask = 0x80; mask != 0; mask >>= 1)
		putchar((c & mask) ? '1' : '0');
}
|
||||
|
||||
/* Print len bytes starting at b in binary, each followed by one space.
 * Nothing is printed when len is zero or negative.
 */
void bprintb(unsigned char *b, int len) {
	int k;

	for (k = 0; k < len; k++) {
		bprint(b[k]);
		printf(" ");
	}
}
|
||||
|
||||
TEST encode_byte_seq(struct seq *s) {
|
||||
creole_word i = 0;
|
||||
int j;
|
||||
unsigned char buf[7];
|
||||
|
||||
for (;;) {
|
||||
/*
|
||||
printf("0x%X ", i);
|
||||
bprintb(s->minbuf, s->encode_to);
|
||||
printf("\n");
|
||||
*/
|
||||
|
||||
ASSERT_EQ(creole_encode(i, s->encode_to, s->high_bits,
|
||||
buf), 1);
|
||||
ASSERT_MEM_EQ(s->minbuf, buf, s->encode_to);
|
||||
|
||||
if (i == s->max)
|
||||
break;
|
||||
i++;
|
||||
|
||||
for (j = s->encode_to - 1; j > 0; j--) {
|
||||
if (s->minbuf[j] == 0xBF) {
|
||||
s->minbuf[j] = 0x80;
|
||||
} else {
|
||||
s->minbuf[j]++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (j == 0)
|
||||
s->minbuf[0]++;
|
||||
}
|
||||
ASSERT_MEM_EQ(s->maxbuf, s->minbuf, s->encode_to);
|
||||
PASS();
|
||||
}
|
||||
|
||||
TEST encode_decode_byte_seq(struct seq *s) {
|
||||
unsigned char buf[7];
|
||||
struct creole_reader r = {0};
|
||||
struct word w;
|
||||
creole_word i = 0;
|
||||
|
||||
for (;;) {
|
||||
ASSERT_EQ(creole_encode(i, s->encode_to, s->high_bits,
|
||||
buf), 1);
|
||||
r.p = buf;
|
||||
r.left = s->encode_to;
|
||||
ASSERT_EQ(decode_seq(&r, &w), 1);
|
||||
ASSERT_EQ(w.len, s->encode_to);
|
||||
ASSERT_EQ(w.high_bits, s->high_bits);
|
||||
ASSERT_EQ(w.word, i);
|
||||
|
||||
if (i == s->max)
|
||||
break;
|
||||
i++;
|
||||
}
|
||||
|
||||
PASS();
|
||||
}
|
||||
|
||||
SUITE(pseudo_utf8_encode_all) {
|
||||
struct seq s;
|
||||
|
||||
RUN_TEST(no_values);
|
||||
|
||||
s.max = 0x7F;
|
||||
s.encode_to = 1;
|
||||
s.high_bits = 0;
|
||||
s.minbuf[0] = 0x00;
|
||||
s.maxbuf[0] = 0x7F;
|
||||
|
||||
RUN_TEST1(encode_byte_seq, &s);
|
||||
|
||||
for (s.high_bits = 0; s.high_bits < 16; s.high_bits++) {
|
||||
memset(s.maxbuf, 0xBF, sizeof(s.maxbuf));
|
||||
|
||||
s.max = 0x7F;
|
||||
s.encode_to = 2;
|
||||
s.maxbuf[0] = s.minbuf[0] = 0xC0 | (s.high_bits << 1);
|
||||
s.maxbuf[0] = 0xC1 | (s.high_bits << 1);
|
||||
s.minbuf[1] = 0x80;
|
||||
RUN_TEST1(encode_byte_seq, &s);
|
||||
|
||||
s.max = 0xFFF;
|
||||
s.encode_to = 3;
|
||||
s.minbuf[0] = 0xE0 | s.high_bits;
|
||||
s.maxbuf[0] = 0xE0 | s.high_bits;
|
||||
s.minbuf[1] = 0x80;
|
||||
s.minbuf[2] = 0x80;
|
||||
RUN_TEST1(encode_byte_seq, &s);
|
||||
|
||||
s.max = 0x1FFFF;
|
||||
s.encode_to = 4;
|
||||
s.maxbuf[0] = s.minbuf[0] = 0xF0 | (s.high_bits >> 1);
|
||||
s.minbuf[1] = 0x80 | (s.high_bits & 0x1 << 5);
|
||||
s.maxbuf[1] = 0x9F | (s.high_bits & 0x1 << 5);
|
||||
s.minbuf[2] = 0x80;
|
||||
s.minbuf[3] = 0x80;
|
||||
RUN_TEST1(encode_byte_seq, &s);
|
||||
|
||||
s.max = 0x3FFFFF;
|
||||
s.encode_to = 5;
|
||||
s.maxbuf[0] = s.minbuf[0] = 0xF8 | (s.high_bits >> 2);
|
||||
s.minbuf[1] = 0x80 | (s.high_bits & 0x3 << 4);
|
||||
s.maxbuf[1] = 0x8F | (s.high_bits & 0x3 << 4);
|
||||
s.minbuf[2] = 0x80;
|
||||
s.minbuf[3] = 0x80;
|
||||
s.minbuf[4] = 0x80;
|
||||
RUN_TEST1(encode_byte_seq, &s);
|
||||
|
||||
s.max = 0x7FFFFFF;
|
||||
s.encode_to = 6;
|
||||
s.maxbuf[0] = s.minbuf[0] = 0xFC | (s.high_bits >> 3);
|
||||
s.minbuf[1] = 0x80 | (s.high_bits & 0x7 << 3);
|
||||
s.maxbuf[1] = 0x87 | (s.high_bits & 0x7 << 3);
|
||||
s.minbuf[2] = 0x80;
|
||||
s.minbuf[3] = 0x80;
|
||||
s.minbuf[4] = 0x80;
|
||||
s.minbuf[5] = 0x80;
|
||||
RUN_TEST1(encode_byte_seq, &s);
|
||||
|
||||
s.max = 0xFFFFFFFF;
|
||||
s.encode_to = 7;
|
||||
s.maxbuf[0] = s.minbuf[0] = 0xFE;
|
||||
s.minbuf[1] = 0x80 | (s.high_bits << 2);
|
||||
s.maxbuf[1] = 0x83 | (s.high_bits << 2);
|
||||
s.minbuf[2] = 0x80;
|
||||
s.minbuf[3] = 0x80;
|
||||
s.minbuf[4] = 0x80;
|
||||
s.minbuf[5] = 0x80;
|
||||
s.minbuf[6] = 0x80;
|
||||
RUN_TEST1(encode_byte_seq, &s);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
GREATEST_MAIN_BEGIN();
|
||||
RUN_SUITE(reader);
|
||||
RUN_SUITE(pseudo_utf8_encode_all);
|
||||
GREATEST_MAIN_END();
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue