/* Copyright (c) 2024, Peter McGoron * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1) Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include "uns.h" #include "uns_string.h" #include "cheney_c89.h" static struct uns_gc gc; static struct uns_ctr empty_list; enum token_type { T_EOF, LPAREN, RPAREN, QUOTE, QUASIQUOTE, UNQUOTE, UNQUOTE_LIST, T_IDENT, T_INT, T_FLOAT, T_STRING, T_DOT, TOKEN_NUM }; static const char *token2string_repr[TOKEN_NUM] = { "'EOF", "(", ")", "quote", "quasiquote", "unquote", "unquote-list", "'ident", "'number", "'float", "'string-tok", "'dot" }; struct token { enum token_type typ; struct uns_ctr dat; double f; long i; }; static int is_ws(int c) { return c == '\n' || c == '\t' || c == '\r' || c == '\v' || c == ' '; } static int get_skipws(FILE *input) { int c; for (;;) { c = getc(input); if (c == ';') { do { c = getc(input); } while (c != '\n'); ungetc(c, input); continue; } if (is_ws(c)) continue; return c; } } static void tok_string(FILE *input, struct token *tok) { int c; tok->typ = T_STRING; uns_string_alloc(&gc, &tok->dat, 32); for (;;) { c = getc(input); switch (c) { case '\\': c = getc(input); switch (c) { case ' ': case '\t': case '\r': case '\n': case '\v': do { c = getc(input); } while (is_ws(c)); ungetc(c, input); continue; case 'r': uns_string_append_char(&gc, &tok->dat, '\r'); continue; case 'n': uns_string_append_char(&gc, &tok->dat, '\n'); continue; case 'v': uns_string_append_char(&gc, &tok->dat, '\v'); continue; case 't': uns_string_append_char(&gc, &tok->dat, '\t'); continue; case '"': uns_string_append_char(&gc, &tok->dat, '"'); continue; default: uns_string_append_char(&gc, &tok->dat, '\\'); uns_string_append_char(&gc, &tok->dat, c); } case '"': /* " */ return; } } } static int tonum(int c) { switch (c) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return c - '0'; default: return -1; } } static void tok_num(FILE *input, struct token *tok, int c) { int is_float = 0; uns_string_alloc(&gc, &tok->dat, 32); do { if (c == '.' || c == 'e' || c == 'E' || c == '-' || c == '+') is_float = 1; uns_string_append_char(&gc, &tok->dat, c); c = getc(input); } while (tonum(c) >= 0 || c == '.' || c == 'e' || c == 'E' || c == '-' || c == '+'); ungetc(c, input); if (is_float) { tok->typ = T_FLOAT; tok->f = strtod(uns_string_cstring(&gc, &tok->dat), NULL); } else { tok->typ = T_INT; tok->i = strtol(uns_string_cstring(&gc, &tok->dat), NULL, 10); } tok->dat.p = NULL; } static int part_of_ident(int c) { return !is_ws(c) && c != '(' && c != ')' && c != '`' && c != '\'' && c != '\\' && c != '\"' && c != ',' && c != ';'; } static void tok_ident(FILE *input, struct token *tok, int c) { tok->typ = T_IDENT; uns_string_alloc(&gc, &tok->dat, 32); do { uns_string_append_char(&gc, &tok->dat, c); c = getc(input); } while (part_of_ident(c)); ungetc(c, input); } static void tokenize(FILE *input, struct token *tok) { int c = get_skipws(input); int c2; tok->dat.p = NULL; switch (c) { case EOF: tok->typ = T_EOF; return; case '(': tok->typ = LPAREN; return; case ')': tok->typ = RPAREN; return; case '\'': tok->typ = QUOTE; return; case '`': tok->typ = QUASIQUOTE; return; case ',': c = getc(input); if (c == '@') { tok->typ = UNQUOTE_LIST; } else { ungetc(c, input); tok->typ = UNQUOTE; } return; case '"': tok_string(input, tok); return; tok->typ = T_STRING; return; case '+': case '-': c2 = getc(input); ungetc(c2, input); if (tonum(c2) >= 0) { tok_num(input, tok, c); return; } /* FALLTHROUGH */ default: if (c == '.') { c2 = getc(input); if (tonum(c2) >= 0) { ungetc(c2, input); tok_num(input, tok, c); } else if (part_of_ident(c2)) { ungetc(c2, input); tok_ident(input, tok, c); } else { tok->typ = T_DOT; } } else if (tonum(c) >= 0) { tok_num(input, tok, c); } else { tok_ident(input, tok, c); } return; } } enum item_type { CELL, LAMBDA, INTEGER, FLOAT, STRING, SYMBOL, EMPTY_LIST, LISP_NULL }; static void alloc_of_type(struct uns_ctr *ctr, int typ) { void *p; int fields; int i; switch (typ) { case CELL: fields = 2; break; case LAMBDA: fields = 2; break; case INTEGER: fields = 1; break; case STRING: fields = 1; break; case SYMBOL: fields = 1; break; case FLOAT: fields = 1; break; case EMPTY_LIST: fields = 0; break; case LISP_NULL: fields = 0; break; default: abort(); } ctr->p = gc.alloc_record(&gc, fields + 1); p = gc.alloc(&gc, sizeof(int)); memcpy(p, &typ, sizeof(int)); gc.record_set_ptr(ctr->p, 0, p); for (i = 0; i < fields; i++) gc.record_set_ptr(ctr->p, i + 1, NULL); } static int get_type(Uns_ptr p) { int typ; void *innerp; if (!p) return LISP_NULL; innerp = gc.record_get_ptr(p, 0); memcpy(&typ, innerp, sizeof(int)); return typ; } static void alloc_symbol_from_cstring(struct uns_ctr *ctr, const char *s, size_t slen) { struct uns_ctr str = {0}; uns_root_add(&gc, &str); uns_string_alloc(&gc, &str, slen); uns_string_append_bytes(&gc, &str, s, slen); alloc_of_type(ctr, SYMBOL); gc.record_set_ptr(ctr->p, 1, str.p); uns_root_remove(&gc, &str); } enum expr_stack_state { EXPR_STACK_INITIAL, EXPR_STACK_QUOTELIKE, EXPR_STACK_SURROUND_OTHER, EXPR_STACK_START_LIST, EXPR_STACK_IN_LIST, EXPR_STACK_IMPROPER_LIST, EXPR_STACK_IMPROPER_LIST_END, EXPR_STACK_INVALID }; enum expr_stack_fields { EXPR_FIELD_NEXT, EXPR_FIELD_PTR, EXPR_FIELD_STATE, EXPR_FIELD_NUM }; static void expr_stack_push(struct uns_ctr *stack, struct uns_ctr *loc, enum expr_stack_state state) { struct uns_ctr tmp = {0}; uns_root_add(&gc, &tmp); tmp.p = stack->p; stack->p = gc.alloc_record(&gc, EXPR_FIELD_NUM); gc.record_set_ptr(stack->p, EXPR_FIELD_NEXT, tmp.p); uns_root_remove(&gc, &tmp); gc.record_set_ptr(stack->p, EXPR_FIELD_PTR, loc->p); tmp.p = gc.alloc(&gc, sizeof(state)); memcpy(tmp.p, &state, sizeof(state)); gc.record_set_ptr(stack->p, EXPR_FIELD_STATE, tmp.p); } static enum expr_stack_state expr_stack_state(struct uns_ctr *stack) { enum expr_stack_state r; if (!stack->p) return EXPR_STACK_INVALID; memcpy(&r, gc.record_get_ptr(stack->p, EXPR_FIELD_STATE), sizeof(r)); return r; } static void expr_stack_change_state(struct uns_ctr *stack, enum expr_stack_state newst) { memcpy(gc.record_get_ptr(stack->p, EXPR_FIELD_STATE), &newst, sizeof(newst)); } static void expr_stack_ctr(struct uns_ctr *stack, struct uns_ctr *loc) { loc->p = gc.record_get_ptr(stack->p, EXPR_FIELD_PTR); } static int expr_stack_pop(struct uns_ctr *stack) { if (!stack->p) return 0; stack->p = gc.record_get_ptr(stack->p, EXPR_FIELD_NEXT); return 1; } enum parser_return { EXPR_PARSE_OK, EXPR_PARSE_EOF, EXPR_PARSE_INCOMPLETE, EXPR_PARSE_EXCESS_RPAREN, EXPR_PARSE_INVALID_EMPTY_LIST, EXPR_PARSE_IMPROPER_LIST_OVERFLOW, EXPR_PARSE_BAD_IMPROPER_LIST, EXPR_PARSE_BAD_STACK_START, EXPR_PARSE_BAD_QUOTE, EXPR_PARSE_INTERNAL_ERROR }; static int tok_to_type(enum token_type tokt) { switch (tokt) { case T_IDENT: return SYMBOL; case T_STRING: return STRING; default: return -1; } } /* Parse expressions using an explicit stack. * This is feasible to to hand-written and has the advantage that error * recovery is much easier than recursion (implicit stack). * * The stack consists of a LIFO: * [previous value][container][parser state] * * The implementation stores containers on the stack and when a container * is complete or an atom is read, it is stored in "expr" and the stack * is modified. */ static enum parser_return expr_parse(FILE *input, struct uns_ctr *expr) { struct uns_ctr stack = {0}; struct uns_ctr loc = {0}; struct uns_ctr new_cell = {0}; struct token tok = {0}; int store = 0; enum parser_return r = EXPR_PARSE_OK; uns_root_add(&gc, &stack); uns_root_add(&gc, &loc); uns_root_add(&gc, &tok.dat); uns_root_add(&gc, &new_cell); /* Allocate expr->p to be a single pointer. */ expr->p = gc.alloc_record(&gc, 1); /* Initialize the stack with the final result. */ expr_stack_push(&stack, expr, EXPR_STACK_INITIAL); while (stack.p) { tokenize(input, &tok); switch (tok.typ) { case LPAREN: /* Push a new cell onto the stack and parse it later. */ alloc_of_type(expr, CELL); expr_stack_push(&stack, expr, EXPR_STACK_START_LIST); break; case QUOTE: case QUASIQUOTE: case UNQUOTE: case UNQUOTE_LIST: /* push (token-name ) onto the stack, and have * it point to the first cell. */ alloc_of_type(&loc, CELL); alloc_symbol_from_cstring(expr, token2string_repr[tok.typ], strlen(token2string_repr[tok.typ])); gc.record_set_ptr(loc.p, 1, expr->p); alloc_of_type(expr, CELL); gc.record_set_ptr(loc.p, 2, expr->p); gc.record_set_ptr(expr->p, 2, empty_list.p); if (tok.typ == QUOTE || tok.typ == QUASIQUOTE) expr_stack_push(&stack, &loc, EXPR_STACK_QUOTELIKE); else expr_stack_push(&stack, &loc, EXPR_STACK_SURROUND_OTHER); break; /* All atoms are allocated into "expr" and will be stored in the * container after the switch. */ case T_IDENT: case T_STRING: alloc_of_type(expr, tok_to_type(tok.typ)); gc.record_set_ptr(expr->p, 1, tok.dat.p); store = 1; break; case T_INT: alloc_of_type(expr, INTEGER); loc.p = gc.alloc(&gc, sizeof(tok.i)); memcpy(loc.p, &tok.i, sizeof(tok.i)); gc.record_set_ptr(expr->p, 1, loc.p); store = 1; break; case T_FLOAT: alloc_of_type(expr, FLOAT); loc.p = gc.alloc(&gc, sizeof(tok.f)); memcpy(loc.p, &tok.f, sizeof(tok.f)); gc.record_set_ptr(expr->p, 1, loc.p); store = 1; break; case RPAREN: switch (expr_stack_state(&stack)) { case EXPR_STACK_START_LIST: /* Empty list found. Pop the cell and discard it. * If we are not in a "surround" state, report error. * Replace the surround state with an empty list. */ expr_stack_pop(&stack); if (!stack.p) { r = EXPR_PARSE_INTERNAL_ERROR; goto end; } if (expr_stack_state(&stack) != EXPR_STACK_QUOTELIKE) { r = EXPR_PARSE_INVALID_EMPTY_LIST; goto end; } /* Discard quote/quasiquote and store the empty list instead. */ if (!expr_stack_pop(&stack)) { r = EXPR_PARSE_INTERNAL_ERROR; goto end; } expr->p = empty_list.p; store = 1; break; case EXPR_STACK_IN_LIST: /* In a list, there are two things on the stack: * first cell and last cell. * Place the empty list in the cdr of the last cell, * and then discard the last cell. Put the first cell into expr * and store it in the container in the stack before it. */ expr_stack_ctr(&stack, &loc); if (get_type(loc.p) != CELL) { r = EXPR_PARSE_INTERNAL_ERROR; goto end; } gc.record_set_ptr(loc.p, 2, empty_list.p); /* FALLTHROUGH */ case EXPR_STACK_IMPROPER_LIST_END: /* CDR of the list has been filled. Pop the last cell. */ expr_stack_pop(&stack); if (!stack.p) { r = EXPR_PARSE_INTERNAL_ERROR; goto end; } /* Put the first cell in expr, and read it into the next item * of the stack. */ expr_stack_ctr(&stack, expr); expr_stack_pop(&stack); store = 1; break; case EXPR_STACK_INITIAL: /* Expression is not balanced. */ r = EXPR_PARSE_EXCESS_RPAREN; goto end; case EXPR_STACK_IMPROPER_LIST: /* (values .) is an invalid expression. */ r = EXPR_PARSE_BAD_IMPROPER_LIST; goto end; case EXPR_STACK_QUOTELIKE: case EXPR_STACK_SURROUND_OTHER: /* ,) ') etc are invalid */ r = EXPR_PARSE_BAD_QUOTE; goto end; case EXPR_STACK_INVALID: r = EXPR_PARSE_INTERNAL_ERROR; goto end; } break; case T_DOT: if (expr_stack_state(&stack) != EXPR_STACK_IN_LIST) { r = EXPR_PARSE_BAD_IMPROPER_LIST; goto end; } expr_stack_change_state(&stack, EXPR_STACK_IMPROPER_LIST); break; case T_EOF: /* If EOF is encountered in the middle of an expression, repor * that. Otherwise, the parser reads nothing. */ if (expr_stack_state(&stack) != EXPR_STACK_INITIAL) { r = EXPR_PARSE_INCOMPLETE; goto end; } r = EXPR_PARSE_EOF; goto end; case TOKEN_NUM: r = EXPR_PARSE_INTERNAL_ERROR; goto end; } /* If store is true, that means "expr" contains a value to * be placed into the container at the top of the stack. * * This is a loop because some cases need to cascade values up the * stack without acting on a token. When the loop stops, further * action requires a new token to be read. */ while (store) switch (expr_stack_state(&stack)) { case EXPR_STACK_INVALID: r = EXPR_PARSE_INTERNAL_ERROR; goto end; case EXPR_STACK_START_LIST: /* The current thing on the stack is the first part of the * list. Add the expression to car of the cell, and add * push a pointer to the cell. This is the last cell in * the list. */ expr_stack_ctr(&stack, &loc); if (get_type(loc.p) != CELL) { r = EXPR_PARSE_INTERNAL_ERROR; goto end; } gc.record_set_ptr(loc.p, 1, expr->p); expr_stack_push(&stack, &loc, EXPR_STACK_IN_LIST); store = 0; break; case EXPR_STACK_IN_LIST: /* The pointer on the top of the stack is the end of the * list. Add a new list to it's cdr, and put expr into * the car of the new end of the list. */ expr_stack_ctr(&stack, &loc); if (get_type(loc.p) != CELL) { r = EXPR_PARSE_INTERNAL_ERROR; goto end; } alloc_of_type(&new_cell, CELL); gc.record_set_ptr(loc.p, 2, new_cell.p); gc.record_set_ptr(new_cell.p, 1, expr->p); expr_stack_pop(&stack); expr_stack_push(&stack, &new_cell, EXPR_STACK_IN_LIST); store = 0; break; case EXPR_STACK_IMPROPER_LIST: /* Add the expression to the cdr of the last element. * Set the state to expect the end of an improper list. */ expr_stack_ctr(&stack, &loc); gc.record_set_ptr(loc.p, 2, expr->p); if (get_type(loc.p) != CELL) { r = EXPR_PARSE_INTERNAL_ERROR; goto end; } expr_stack_change_state(&stack, EXPR_STACK_IMPROPER_LIST_END); store = 0; break; case EXPR_STACK_IMPROPER_LIST_END: r = EXPR_PARSE_IMPROPER_LIST_OVERFLOW; goto end; case EXPR_STACK_QUOTELIKE: case EXPR_STACK_SURROUND_OTHER: /* Expression at the top of the stack is * (surrouding-thing ) * Replace (car (cdr (top-of-stack)) with expr. */ expr_stack_ctr(&stack, &loc); if (get_type(loc.p) != CELL) { r = EXPR_PARSE_INTERNAL_ERROR; goto end; } loc.p = gc.record_get_ptr(loc.p, 2); if (get_type(loc.p) != CELL) { r = EXPR_PARSE_INTERNAL_ERROR; goto end; } gc.record_set_ptr(loc.p, 1, expr->p); /* Get the head of the quoted value again. */ expr_stack_ctr(&stack, expr); expr_stack_pop(&stack); /* This value needs to be stored somewhere. Continue looping. */ store = 1; break; case EXPR_STACK_INITIAL: /* Finished parsing an expression at toplevel. */ expr_stack_ctr(&stack, &loc); gc.record_set_ptr(loc.p, 0, expr->p); /* Prepare for return. */ expr->p = loc.p; expr_stack_pop(&stack); if (stack.p) { r = EXPR_PARSE_BAD_STACK_START; goto end; } store = 0; break; } } end: uns_root_remove(&gc, &stack); uns_root_remove(&gc, &loc); uns_root_remove(&gc, &tok.dat); uns_root_remove(&gc, &new_cell); return r; } static void oom(struct uns_gc *gc_) { (void)gc_; printf("oom\n"); abort(); } static void after_gc(struct uns_gc *gc_) { (void)gc_; fprintf(stderr, "The garbage collector has run %ld times\n" "\tbefore collection: %lu\n" "\tafter collection: %lu\n", gc.collection_number, gc.before_collection, gc.after_collection ); if (gc.after_collection >= gc.before_collection * 7/10) { fprintf(stderr, "\tincreasing\n"); gc.next_alloc *= 2; } } static void init_gc(void) { gc.next_alloc = 512; gc.oom = oom; gc.after_gc = after_gc; gc.ctx = malloc(uns_cheney_c89_ctx_size); if(!gc.ctx || !uns_cheney_c89_init(&gc)) exit(1); gc.next_alloc *= 2; uns_root_add(&gc, &empty_list); alloc_of_type(&empty_list, EMPTY_LIST); } static void display(struct uns_ctr *ctr, long indent) { struct uns_ctr tmp = {0}; long l; double f; if (!ctr->p) { printf(""); return; } switch (get_type(ctr->p)) { case CELL: uns_root_add(&gc, &tmp); printf("("); tmp.p = gc.record_get_ptr(ctr->p, 1); display(&tmp, indent); ctr->p = gc.record_get_ptr(ctr->p, 2); while (get_type(ctr->p) == CELL) { tmp.p = gc.record_get_ptr(ctr->p, 1); if (get_type(tmp.p) == CELL) { printf("\n"); for (l = 0; l < indent; l++) printf(" "); display(&tmp, indent + 1); } else { printf(" "); display(&tmp, indent); } ctr->p = gc.record_get_ptr(ctr->p, 2); } switch (get_type(ctr->p)) { case EMPTY_LIST: printf(")"); break; default: printf(" . "); display(ctr, indent); printf(")"); break; } uns_root_remove(&gc, &tmp); return; case INTEGER: memcpy(&l, gc.record_get_ptr(ctr->p, 1), sizeof(long)); printf("%ld", l); return; case FLOAT: memcpy(&f, gc.record_get_ptr(ctr->p, 1), sizeof(double)); printf("%f", f); return; case STRING: tmp.p = gc.record_get_ptr(ctr->p, 1); uns_root_add(&gc, &tmp); printf("\"%s\"", uns_string_cstring(&gc, &tmp)); uns_root_remove(&gc, &tmp); return; case SYMBOL: tmp.p = gc.record_get_ptr(ctr->p, 1); uns_root_add(&gc, &tmp); printf("%s", uns_string_cstring(&gc, &tmp)); uns_root_remove(&gc, &tmp); return; case EMPTY_LIST: printf("'()"); return; default: abort(); } } /* Evaluation: * * Symbols are looked up for their value in the environment. * Numbers, strings, etc. are returned as-is. * Quote returns the list verbatim. Quasiquote runs another evaluation * procedure. * * All function calls evalute the arguments, left to right. If the arguments * are not a list, then it evaluated. * * A tail-call of the form ((__unilambda l body) . args) binds the evaluated * args to l and jumps to body, without pushing a stack. * * Any other tail-call of the form (function . args) will push a new stack * with the new values passed to the function, the previous stack removed and * the new stack linked to the previous-previous stack. * * A non-tail call will push a new stack with a link to the previous stack. * * A lambda that is not evaluated immediately is stored with a link to the * top stack frame, the binding symbol and the code verbatim. * * A tail-call is either in the body of __unilambda or the latter value in * "if". The repl is not a tail-call position. * * If "(current-continuation)" is the argument of a call, then the * continuation is passed to the argument. The continuation proceeds in the * body. * * If "(current-continuation)" is the body of a lambda, then the stack is * unwound until the result will be the argument to another lambda. * * A continuation is a pointer to the stack frame and the expression to * evaluate. */ int main(void) { struct uns_ctr expr = {0}; init_gc(); uns_root_add(&gc, &expr); while (!feof(stdin)) { expr.p = NULL; switch (expr_parse(stdin, &expr)) { case EXPR_PARSE_OK: expr.p = gc.record_get_ptr(expr.p, 0); display(&expr, 1); printf("\n"); break; case EXPR_PARSE_INCOMPLETE: fprintf(stderr, "EOF before expression was finished\n"); break; case EXPR_PARSE_EXCESS_RPAREN: fprintf(stderr, "Unbalanced parentheses\n"); break; case EXPR_PARSE_INVALID_EMPTY_LIST: fprintf(stderr, "Invalid syntax for empty list (either '() or `'())\n"); break; case EXPR_PARSE_BAD_QUOTE: fprintf(stderr, "Invalid syntax for quotes/unquotes\n"); break; case EXPR_PARSE_IMPROPER_LIST_OVERFLOW: fprintf(stderr, "too many values at end of improper list (must be exactly one)\n"); case EXPR_PARSE_BAD_IMPROPER_LIST: fprintf(stderr, "Bad syntax for improper list (must be (value . value))\n"); break; case EXPR_PARSE_INTERNAL_ERROR: fprintf(stderr, "Bug in implementation\n"); break; case EXPR_PARSE_BAD_STACK_START: fprintf(stderr, "Bug: bottom of stack not correct\n"); break; case EXPR_PARSE_EOF: fprintf(stderr, "EOF\n"); break; } } uns_root_remove(&gc, &expr); uns_cheney_c89_deinit(&gc); return 0; }