From b897cc6d25c059f4963f83e7c974ab5a4d436897 Mon Sep 17 00:00:00 2001 From: Peter McGoron Date: Sun, 23 Jun 2024 00:30:44 -0400 Subject: [PATCH] new expression parser with explicit stack for error handling --- main.c | 589 +++++++++++++++++++++++++++++++++++++++------------- prelude.scm | 64 ++++-- 2 files changed, 480 insertions(+), 173 deletions(-) diff --git a/main.c b/main.c index d817e15..759f387 100644 --- a/main.c +++ b/main.c @@ -49,21 +49,20 @@ enum token_type { TOKEN_NUM }; -#if 0 -static const char *token2string[TOKEN_NUM] = { - "EOF", +static const char *token2string_repr[TOKEN_NUM] = { + "'EOF", "(", ")", - "'", - "`", - ",", - ",@", - "IDENT", - "NUMBER_TOK", - "STRING_TOK", - "." + "quote", + "quasiquote", + "unquote", + "unquote-list", + "'ident", + "'number", + "'float", + "'string-tok", + "'dot" }; -#endif struct token { enum token_type typ; @@ -286,6 +285,7 @@ static void alloc_of_type(struct uns_ctr *ctr, int typ) case FLOAT: fields = 1; break; case EMPTY_LIST: fields = 0; break; case LISP_NULL: fields = 0; break; + default: abort(); } ctr->p = gc.alloc_record(&gc, fields + 1); @@ -324,161 +324,382 @@ static void alloc_symbol_from_cstring(struct uns_ctr *ctr, const char *s, size_t uns_root_remove(&gc, &str); } -static void alloc_integer(struct uns_ctr *ctr, long l) -{ - void *p; - alloc_of_type(ctr, INTEGER); +enum expr_stack_state { + EXPR_STACK_INITIAL, + EXPR_STACK_QUOTELIKE, + EXPR_STACK_SURROUND_OTHER, + EXPR_STACK_START_LIST, + EXPR_STACK_IN_LIST, + EXPR_STACK_IMPROPER_LIST, + EXPR_STACK_IMPROPER_LIST_END, + EXPR_STACK_INVALID +}; - p = gc.alloc(&gc, sizeof(long)); - memcpy(p, &l, sizeof(long)); - gc.record_set_ptr(ctr->p, 1, p); -} +enum expr_stack_fields { + EXPR_FIELD_NEXT, + EXPR_FIELD_PTR, + EXPR_FIELD_STATE, + EXPR_FIELD_NUM +}; -static void alloc_float(struct uns_ctr *ctr, double f) -{ - void *p; - alloc_of_type(ctr, FLOAT); - - p = gc.alloc(&gc, sizeof(double)); - memcpy(p, &f, sizeof(double)); - gc.record_set_ptr(ctr->p, 1, p); -} - -static int expr_all(FILE *input, struct token *tok, struct uns_ctr *expr); - -static int list_expr(FILE *input, struct uns_ctr *expr, struct token *tok) -{ - struct uns_ctr in_car = {0}; - struct uns_ctr in_cdr = {0}; - struct uns_ctr cur_head = {0}; - int r = 1; - - uns_root_add(&gc, &in_car); - uns_root_add(&gc, &in_cdr); - uns_root_add(&gc, &cur_head); - - alloc_of_type(expr, CELL); - cur_head.p = expr->p; - - for (;;) { - if (!expr_all(input, tok, &in_car)) { - r = 0; - goto end; - } - - gc.record_set_ptr(cur_head.p, 1, in_car.p); - - tokenize(input, tok); - if (tok->typ == RPAREN) - break; - - alloc_of_type(&in_cdr, CELL); - - if (tok->typ == T_DOT) { - tokenize(input, tok); - expr_all(input, tok, &in_cdr); - gc.record_set_ptr(cur_head.p, 2, in_cdr.p); - tokenize(input, tok); - if (tok->typ != RPAREN) { - r = 0; - } - - goto end; - } - - gc.record_set_ptr(cur_head.p, 2, in_cdr.p); - cur_head.p = in_cdr.p; - } - - gc.record_set_ptr(cur_head.p, 2, empty_list.p); -end: - uns_root_remove(&gc, &in_car); - uns_root_remove(&gc, &in_cdr); - uns_root_remove(&gc, &cur_head); - return r; -} - -static int surround_expr(FILE *input, struct uns_ctr *expr, struct token *tok, const char *name) +static void expr_stack_push(struct uns_ctr *stack, struct uns_ctr *loc, enum expr_stack_state state) { struct uns_ctr tmp = {0}; - struct uns_ctr quoted = {0}; - int r = 0; uns_root_add(&gc, &tmp); - uns_root_add(&gc, "ed); - - alloc_of_type(expr, CELL); - - alloc_symbol_from_cstring(&tmp, name, strlen(name)); - gc.record_set_ptr(expr->p, 1, tmp.p); - - alloc_of_type(&tmp, CELL); - gc.record_set_ptr(expr->p, 2, tmp.p); - - tokenize(input, tok); - if (expr_all(input, tok, "ed)) { - r = 1; - gc.record_set_ptr(tmp.p, 1, quoted.p); - gc.record_set_ptr(tmp.p, 2, empty_list.p); - } - + tmp.p = stack->p; + stack->p = gc.alloc_record(&gc, EXPR_FIELD_NUM); + gc.record_set_ptr(stack->p, EXPR_FIELD_NEXT, tmp.p); uns_root_remove(&gc, &tmp); - uns_root_remove(&gc, "ed); + + gc.record_set_ptr(stack->p, EXPR_FIELD_PTR, loc->p); + + tmp.p = gc.alloc(&gc, sizeof(state)); + memcpy(tmp.p, &state, sizeof(state)); + gc.record_set_ptr(stack->p, EXPR_FIELD_STATE, tmp.p); +} + +static enum expr_stack_state expr_stack_state(struct uns_ctr *stack) +{ + enum expr_stack_state r; + if (!stack->p) + return EXPR_STACK_INVALID; + memcpy(&r, gc.record_get_ptr(stack->p, EXPR_FIELD_STATE), + sizeof(r)); return r; } -/* expr does not call tokenizer directly: it acts on an input token */ -static int expr_all(FILE *input, struct token *tok, struct uns_ctr *expr) +static void expr_stack_change_state(struct uns_ctr *stack, enum expr_stack_state newst) { - switch (tok->typ) { - case LPAREN: - tokenize(input, tok); - if (tok->typ == RPAREN) - expr->p = empty_list.p; - else - return list_expr(input, expr, tok); - break; - case QUOTE: - return surround_expr(input, expr, tok, "quote"); - case QUASIQUOTE: - return surround_expr(input, expr, tok, "quasiquote"); - case UNQUOTE: - return surround_expr(input, expr, tok, "unquote"); - case UNQUOTE_LIST: - return surround_expr(input, expr, tok, "unquote-list"); - case T_IDENT: - alloc_of_type(expr, SYMBOL); - gc.record_set_ptr(expr->p, 1, tok->dat.p); - break; - case T_INT: - alloc_integer(expr, tok->i); - break; - case T_FLOAT: - alloc_float(expr, tok->f); - break; - case T_STRING: - alloc_of_type(expr, STRING); - gc.record_set_ptr(expr->p, 1, tok->dat.p); - break; - case RPAREN: case T_EOF: case TOKEN_NUM: case T_DOT: - return 0; - } + memcpy(gc.record_get_ptr(stack->p, EXPR_FIELD_STATE), + &newst, + sizeof(newst)); +} +static void expr_stack_ctr(struct uns_ctr *stack, struct uns_ctr *loc) +{ + loc->p = gc.record_get_ptr(stack->p, EXPR_FIELD_PTR); +} + +static int expr_stack_pop(struct uns_ctr *stack) +{ + if (!stack->p) + return 0; + stack->p = gc.record_get_ptr(stack->p, EXPR_FIELD_NEXT); return 1; } -static int read_next(FILE *input, struct uns_ctr *expr) +enum parser_return { + EXPR_PARSE_OK, + EXPR_PARSE_EOF, + EXPR_PARSE_INCOMPLETE, + EXPR_PARSE_EXCESS_RPAREN, + EXPR_PARSE_INVALID_EMPTY_LIST, + EXPR_PARSE_IMPROPER_LIST_OVERFLOW, + EXPR_PARSE_BAD_IMPROPER_LIST, + EXPR_PARSE_BAD_STACK_START, + EXPR_PARSE_BAD_QUOTE, + EXPR_PARSE_INTERNAL_ERROR +}; + +static int tok_to_type(enum token_type tokt) { + switch (tokt) { + case T_IDENT: return SYMBOL; + case T_STRING: return STRING; + default: return -1; + } +} + +/* Parse expressions using an explicit stack. + * This is feasible to to hand-written and has the advantage that error + * recovery is much easier than recursion (implicit stack). + * + * The stack consists of a LIFO: + * [previous value][container][parser state] + * + * The implementation stores containers on the stack and when a container + * is complete or an atom is read, it is stored in "expr" and the stack + * is modified. + */ +static enum parser_return expr_parse(FILE *input, struct uns_ctr *expr) +{ + struct uns_ctr stack = {0}; + struct uns_ctr loc = {0}; + struct uns_ctr new_cell = {0}; struct token tok = {0}; - int r = 0; + int store = 0; + enum parser_return r = EXPR_PARSE_OK; + uns_root_add(&gc, &stack); + uns_root_add(&gc, &loc); uns_root_add(&gc, &tok.dat); - tokenize(input, &tok); - expr->p = NULL; - if (tok.typ != T_EOF) - r = expr_all(input, &tok, expr); - uns_root_remove(&gc, &tok.dat); + uns_root_add(&gc, &new_cell); + /* Allocate expr->p to be a single pointer. */ + expr->p = gc.alloc_record(&gc, 1); + + /* Initialize the stack with the final result. */ + expr_stack_push(&stack, expr, EXPR_STACK_INITIAL); + + while (stack.p) { + tokenize(input, &tok); + + switch (tok.typ) { + case LPAREN: + /* Push a new cell onto the stack and parse it later. */ + alloc_of_type(expr, CELL); + expr_stack_push(&stack, expr, EXPR_STACK_START_LIST); + break; + case QUOTE: case QUASIQUOTE: case UNQUOTE: case UNQUOTE_LIST: + /* push (token-name ) onto the stack, and have + * it point to the first cell. + */ + alloc_of_type(&loc, CELL); + alloc_symbol_from_cstring(expr, token2string_repr[tok.typ], + strlen(token2string_repr[tok.typ])); + gc.record_set_ptr(loc.p, 1, expr->p); + + alloc_of_type(expr, CELL); + gc.record_set_ptr(loc.p, 2, expr->p); + gc.record_set_ptr(expr->p, 2, empty_list.p); + + if (tok.typ == QUOTE || tok.typ == QUASIQUOTE) + expr_stack_push(&stack, &loc, EXPR_STACK_QUOTELIKE); + else + expr_stack_push(&stack, &loc, EXPR_STACK_SURROUND_OTHER); + break; + + /* All atoms are allocated into "expr" and will be stored in the + * container after the switch. + */ + case T_IDENT: case T_STRING: + alloc_of_type(expr, tok_to_type(tok.typ)); + gc.record_set_ptr(expr->p, 1, tok.dat.p); + store = 1; + break; + case T_INT: + alloc_of_type(expr, INTEGER); + loc.p = gc.alloc(&gc, sizeof(tok.i)); + memcpy(loc.p, &tok.i, sizeof(tok.i)); + gc.record_set_ptr(expr->p, 1, loc.p); + store = 1; + break; + case T_FLOAT: + alloc_of_type(expr, FLOAT); + loc.p = gc.alloc(&gc, sizeof(tok.f)); + memcpy(loc.p, &tok.f, sizeof(tok.f)); + gc.record_set_ptr(expr->p, 1, loc.p); + store = 1; + break; + + case RPAREN: + switch (expr_stack_state(&stack)) { + case EXPR_STACK_START_LIST: + /* Empty list found. Pop the cell and discard it. + * If we are not in a "surround" state, report error. + * Replace the surround state with an empty list. + */ + expr_stack_pop(&stack); + if (!stack.p) { + r = EXPR_PARSE_INTERNAL_ERROR; + goto end; + } + + if (expr_stack_state(&stack) != EXPR_STACK_QUOTELIKE) { + r = EXPR_PARSE_INVALID_EMPTY_LIST; + goto end; + } + + /* Discard quote/quasiquote and store the empty list instead. */ + if (!expr_stack_pop(&stack)) { + r = EXPR_PARSE_INTERNAL_ERROR; + goto end; + } + + expr->p = empty_list.p; + store = 1; + break; + case EXPR_STACK_IN_LIST: + /* In a list, there are two things on the stack: + * first cell and last cell. + * Place the empty list in the cdr of the last cell, + * and then discard the last cell. Put the first cell into expr + * and store it in the container in the stack before it. + */ + expr_stack_ctr(&stack, &loc); + if (get_type(loc.p) != CELL) { + r = EXPR_PARSE_INTERNAL_ERROR; + goto end; + } + + gc.record_set_ptr(loc.p, 2, empty_list.p); + /* FALLTHROUGH */ + case EXPR_STACK_IMPROPER_LIST_END: + /* CDR of the list has been filled. Pop the last cell. */ + expr_stack_pop(&stack); + if (!stack.p) { + r = EXPR_PARSE_INTERNAL_ERROR; + goto end; + } + + /* Put the first cell in expr, and read it into the next item + * of the stack. + */ + expr_stack_ctr(&stack, expr); + expr_stack_pop(&stack); + store = 1; + break; + case EXPR_STACK_INITIAL: + /* Expression is not balanced. */ + r = EXPR_PARSE_EXCESS_RPAREN; + goto end; + case EXPR_STACK_IMPROPER_LIST: + /* (values .) is an invalid expression. */ + r = EXPR_PARSE_BAD_IMPROPER_LIST; + goto end; + case EXPR_STACK_QUOTELIKE: case EXPR_STACK_SURROUND_OTHER: + /* ,) ') etc are invalid */ + r = EXPR_PARSE_BAD_QUOTE; + goto end; + case EXPR_STACK_INVALID: + r = EXPR_PARSE_INTERNAL_ERROR; + goto end; + } + break; + case T_DOT: + if (expr_stack_state(&stack) != EXPR_STACK_IN_LIST) { + r = EXPR_PARSE_BAD_IMPROPER_LIST; + goto end; + } + + expr_stack_change_state(&stack, EXPR_STACK_IMPROPER_LIST); + break; + case T_EOF: + /* If EOF is encountered in the middle of an expression, repor + * that. Otherwise, the parser reads nothing. + */ + if (expr_stack_state(&stack) != EXPR_STACK_INITIAL) { + r = EXPR_PARSE_INCOMPLETE; + goto end; + } + r = EXPR_PARSE_EOF; + goto end; + case TOKEN_NUM: + r = EXPR_PARSE_INTERNAL_ERROR; + goto end; + } + + /* If store is true, that means "expr" contains a value to + * be placed into the container at the top of the stack. + * + * This is a loop because some cases need to cascade values up the + * stack without acting on a token. When the loop stops, further + * action requires a new token to be read. + */ + while (store) switch (expr_stack_state(&stack)) { + case EXPR_STACK_INVALID: + r = EXPR_PARSE_INTERNAL_ERROR; + goto end; + case EXPR_STACK_START_LIST: + /* The current thing on the stack is the first part of the + * list. Add the expression to car of the cell, and add + * push a pointer to the cell. This is the last cell in + * the list. + */ + expr_stack_ctr(&stack, &loc); + if (get_type(loc.p) != CELL) { + r = EXPR_PARSE_INTERNAL_ERROR; + goto end; + } + + gc.record_set_ptr(loc.p, 1, expr->p); + expr_stack_push(&stack, &loc, EXPR_STACK_IN_LIST); + store = 0; + break; + case EXPR_STACK_IN_LIST: + /* The pointer on the top of the stack is the end of the + * list. Add a new list to it's cdr, and put expr into + * the car of the new end of the list. + */ + expr_stack_ctr(&stack, &loc); + if (get_type(loc.p) != CELL) { + r = EXPR_PARSE_INTERNAL_ERROR; + goto end; + } + + alloc_of_type(&new_cell, CELL); + gc.record_set_ptr(loc.p, 2, new_cell.p); + gc.record_set_ptr(new_cell.p, 1, expr->p); + expr_stack_pop(&stack); + expr_stack_push(&stack, &new_cell, EXPR_STACK_IN_LIST); + store = 0; + break; + case EXPR_STACK_IMPROPER_LIST: + /* Add the expression to the cdr of the last element. + * Set the state to expect the end of an improper list. + */ + expr_stack_ctr(&stack, &loc); + gc.record_set_ptr(loc.p, 2, expr->p); + if (get_type(loc.p) != CELL) { + r = EXPR_PARSE_INTERNAL_ERROR; + goto end; + } + + expr_stack_change_state(&stack, EXPR_STACK_IMPROPER_LIST_END); + store = 0; + break; + case EXPR_STACK_IMPROPER_LIST_END: + r = EXPR_PARSE_IMPROPER_LIST_OVERFLOW; + goto end; + case EXPR_STACK_QUOTELIKE: case EXPR_STACK_SURROUND_OTHER: + /* Expression at the top of the stack is + * (surrouding-thing ) + * Replace (car (cdr (top-of-stack)) with expr. + */ + expr_stack_ctr(&stack, &loc); + if (get_type(loc.p) != CELL) { + r = EXPR_PARSE_INTERNAL_ERROR; + goto end; + } + + loc.p = gc.record_get_ptr(loc.p, 2); + if (get_type(loc.p) != CELL) { + r = EXPR_PARSE_INTERNAL_ERROR; + goto end; + } + + gc.record_set_ptr(loc.p, 1, expr->p); + + /* Get the head of the quoted value again. */ + expr_stack_ctr(&stack, expr); + expr_stack_pop(&stack); + + /* This value needs to be stored somewhere. Continue looping. */ + store = 1; + break; + case EXPR_STACK_INITIAL: + /* Finished parsing an expression at toplevel. */ + expr_stack_ctr(&stack, &loc); + gc.record_set_ptr(loc.p, 0, expr->p); + + /* Prepare for return. */ + expr->p = loc.p; + expr_stack_pop(&stack); + if (stack.p) { + r = EXPR_PARSE_BAD_STACK_START; + goto end; + } + store = 0; + break; + } + } + +end: + uns_root_remove(&gc, &stack); + uns_root_remove(&gc, &loc); + uns_root_remove(&gc, &tok.dat); + uns_root_remove(&gc, &new_cell); return r; } @@ -580,6 +801,42 @@ static void display(struct uns_ctr *ctr) } } +/* Evaluation: + * + * Symbols are looked up for their value in the environment. + * Numbers, strings, etc. are returned as-is. + * Quote returns the list verbatim. Quasiquote runs another evaluation + * procedure. + * + * All function calls evalute the arguments, left to right. If the arguments + * are not a list, then it evaluated. + * + * A tail-call of the form ((__unilambda l body) . args) binds the evaluated + * args to l and jumps to body, without pushing a stack. + * + * Any other tail-call of the form (function . args) will push a new stack + * with the new values passed to the function, the previous stack removed and + * the new stack linked to the previous-previous stack. + * + * A non-tail call will push a new stack with a link to the previous stack. + * + * A lambda that is not evaluated immediately is stored with a link to the + * top stack frame, the binding symbol and the code verbatim. + * + * A tail-call is either in the body of __unilambda or the latter value in + * "if". The repl is not a tail-call position. + * + * If "(current-continuation)" is the argument of a call, then the + * continuation is passed to the argument. The continuation proceeds in the + * body. + * + * If "(current-continuation)" is the body of a lambda, then the stack is + * unwound until the result will be the argument to another lambda. + * + * A continuation is a pointer to the stack frame and the expression to + * evaluate. + */ + int main(void) { struct uns_ctr expr = {0}; @@ -587,9 +844,41 @@ int main(void) init_gc(); uns_root_add(&gc, &expr); - while (read_next(stdin, &expr)) { - display(&expr); - printf("\n"); + while (!feof(stdin)) { + expr.p = NULL; + switch (expr_parse(stdin, &expr)) { + case EXPR_PARSE_OK: + expr.p = gc.record_get_ptr(expr.p, 0); + display(&expr); + printf("\n"); + break; + case EXPR_PARSE_INCOMPLETE: + fprintf(stderr, "EOF before expression was finished\n"); + break; + case EXPR_PARSE_EXCESS_RPAREN: + fprintf(stderr, "Unbalanced parentheses\n"); + break; + case EXPR_PARSE_INVALID_EMPTY_LIST: + fprintf(stderr, "Invalid syntax for empty list (either '() or `'())\n"); + break; + case EXPR_PARSE_BAD_QUOTE: + fprintf(stderr, "Invalid syntax for quotes/unquotes\n"); + break; + case EXPR_PARSE_IMPROPER_LIST_OVERFLOW: + fprintf(stderr, "too many values at end of improper list (must be exactly one)\n"); + case EXPR_PARSE_BAD_IMPROPER_LIST: + fprintf(stderr, "Bad syntax for improper list (must be (value . value))"); + break; + case EXPR_PARSE_INTERNAL_ERROR: + fprintf(stderr, "Bug in implementation\n"); + break; + case EXPR_PARSE_BAD_STACK_START: + fprintf(stderr, "Bug: bottom of stack not correct\n"); + break; + case EXPR_PARSE_EOF: + fprintf(stderr, "EOF\n"); + break; + } } uns_root_remove(&gc, &expr); diff --git a/prelude.scm b/prelude.scm index cb22b18..61aefce 100644 --- a/prelude.scm +++ b/prelude.scm @@ -30,10 +30,10 @@ ; __define: define in the environment ; __define-macro: Like __define but for macros ; set! -; if +; __exec: apply an argument to a lambda and return it to the continuation. +; __continue-if: continuation version of if. ; ; Others: -; __uniapply: Apply function to single argument ; : A value that can only be passed to __unilambda. It cannot ; be bound to a top-level, or "set!", or added to a list. ; It can also be the parameter of a __unilambda, although @@ -41,6 +41,15 @@ ; ; Macros are functions that return an sexpr. +(__define-macro if + (__unilambda l + (__continue-if (car l) + (__unilambda (car (cdr l))) + (__unilambda (car (cdr (cdr l)))) + ) + ) +) + (__define-macro let (__unilambda l ; car = args, cdr = body (if (null? (car args)) @@ -48,9 +57,11 @@ ; (car args) = ((A a) (B b) ...) ; (car (car args)) = (A a) ; (cdr (car (car args))) = (a) - `((__unilambda ,(car (car (car args))) - (let ,(cdr (car args)) ,@body)) - . ,(car (cdr (car (car args)))) + `(__exec (__unilambda ,(car (cdr (car (car args))))) + + (__unilambda ,(car (car (car args))) + (let ,(cdr (car args)) ,@body) + ) ) ) ) @@ -58,16 +69,17 @@ (__define-macro let* let) -; if and __unilambda only execute one statement. This uses Flatrate's -; evaluation order (arguments first) and tail-call optimization to simulate -; multiple statements. +; if and __unilambda only execute one statement. (__define-macro begin body (let ((first (car body)) (rest (cdr body)) ) (if (null? rest) first - `((__unilambda (begin ,@rest)) . ,first) + `(__exec (__unilambda ,first) + + (__unilambda (begin ,@rest)) + ) ) ) ) @@ -85,14 +97,17 @@ (begin ,@body) ) ) - (let* ((argval (cons args)) - (rest (cdr args)) - (arg (cons argval)) - (val (cons (cdr argval))) + (if (symbol? args) + `(__unilambda ,args ,@body) + (let* ((argval (cons args)) + (rest (cdr args)) + (arg (cons argval)) + (val (cons (cdr argval))) + ) + `(__unilambda ,larg + (let ((,arg ,val)) __bindlambda ,larg ,rest ,@body) ) - `(__unilambda ,larg - (let ((,arg ,val)) __bindlambda ,larg ,rest ,@body) - ) + ) ) ) ) @@ -118,13 +133,11 @@ ) (__define-macro define-macro - (__unilambda l - (let* ((name-and-args (car l)) - (name (car name-and-args)) - (args (cdr name-and-args)) - (body (cdr l)) - (tmpname (gensym)) - ) + (lambda (name-and-args . body) + (let ((name (car name-and-args)) + (args (cdr name-and-args)) + (tmpname (gensym)) + ) `(__define-macro ,name (lambda ,args ,@body) ) @@ -189,6 +202,11 @@ ;;;;;;;;;;;;;; Standard Library ;;;;;;;;;;;;;;;;;;; +(define (call-with-current-continuation f) + (f (current-continuation)) +) +(define call/cc call-with-current-continuation) + ;; for a list of (v1 v2 ... vn) ;; runs (f vn (... (f v2 (f v1 start)))) (define (foldl f start l)