new expression parser with explicit stack for error handling

This commit is contained in:
Peter McGoron 2024-06-23 00:30:44 -04:00
parent 0647b65712
commit b897cc6d25
2 changed files with 480 additions and 173 deletions

589
main.c
View File

@ -49,21 +49,20 @@ enum token_type {
TOKEN_NUM
};
#if 0
static const char *token2string[TOKEN_NUM] = {
"EOF",
static const char *token2string_repr[TOKEN_NUM] = {
"'EOF",
"(",
")",
"'",
"`",
",",
",@",
"IDENT",
"NUMBER_TOK",
"STRING_TOK",
"."
"quote",
"quasiquote",
"unquote",
"unquote-list",
"'ident",
"'number",
"'float",
"'string-tok",
"'dot"
};
#endif
struct token {
enum token_type typ;
@ -286,6 +285,7 @@ static void alloc_of_type(struct uns_ctr *ctr, int typ)
case FLOAT: fields = 1; break;
case EMPTY_LIST: fields = 0; break;
case LISP_NULL: fields = 0; break;
default: abort();
}
ctr->p = gc.alloc_record(&gc, fields + 1);
@ -324,161 +324,382 @@ static void alloc_symbol_from_cstring(struct uns_ctr *ctr, const char *s, size_t
uns_root_remove(&gc, &str);
}
static void alloc_integer(struct uns_ctr *ctr, long l)
{
void *p;
alloc_of_type(ctr, INTEGER);
enum expr_stack_state {
EXPR_STACK_INITIAL,
EXPR_STACK_QUOTELIKE,
EXPR_STACK_SURROUND_OTHER,
EXPR_STACK_START_LIST,
EXPR_STACK_IN_LIST,
EXPR_STACK_IMPROPER_LIST,
EXPR_STACK_IMPROPER_LIST_END,
EXPR_STACK_INVALID
};
p = gc.alloc(&gc, sizeof(long));
memcpy(p, &l, sizeof(long));
gc.record_set_ptr(ctr->p, 1, p);
}
enum expr_stack_fields {
EXPR_FIELD_NEXT,
EXPR_FIELD_PTR,
EXPR_FIELD_STATE,
EXPR_FIELD_NUM
};
static void alloc_float(struct uns_ctr *ctr, double f)
{
void *p;
alloc_of_type(ctr, FLOAT);
p = gc.alloc(&gc, sizeof(double));
memcpy(p, &f, sizeof(double));
gc.record_set_ptr(ctr->p, 1, p);
}
static int expr_all(FILE *input, struct token *tok, struct uns_ctr *expr);
static int list_expr(FILE *input, struct uns_ctr *expr, struct token *tok)
{
struct uns_ctr in_car = {0};
struct uns_ctr in_cdr = {0};
struct uns_ctr cur_head = {0};
int r = 1;
uns_root_add(&gc, &in_car);
uns_root_add(&gc, &in_cdr);
uns_root_add(&gc, &cur_head);
alloc_of_type(expr, CELL);
cur_head.p = expr->p;
for (;;) {
if (!expr_all(input, tok, &in_car)) {
r = 0;
goto end;
}
gc.record_set_ptr(cur_head.p, 1, in_car.p);
tokenize(input, tok);
if (tok->typ == RPAREN)
break;
alloc_of_type(&in_cdr, CELL);
if (tok->typ == T_DOT) {
tokenize(input, tok);
expr_all(input, tok, &in_cdr);
gc.record_set_ptr(cur_head.p, 2, in_cdr.p);
tokenize(input, tok);
if (tok->typ != RPAREN) {
r = 0;
}
goto end;
}
gc.record_set_ptr(cur_head.p, 2, in_cdr.p);
cur_head.p = in_cdr.p;
}
gc.record_set_ptr(cur_head.p, 2, empty_list.p);
end:
uns_root_remove(&gc, &in_car);
uns_root_remove(&gc, &in_cdr);
uns_root_remove(&gc, &cur_head);
return r;
}
static int surround_expr(FILE *input, struct uns_ctr *expr, struct token *tok, const char *name)
static void expr_stack_push(struct uns_ctr *stack, struct uns_ctr *loc, enum expr_stack_state state)
{
struct uns_ctr tmp = {0};
struct uns_ctr quoted = {0};
int r = 0;
uns_root_add(&gc, &tmp);
uns_root_add(&gc, &quoted);
alloc_of_type(expr, CELL);
alloc_symbol_from_cstring(&tmp, name, strlen(name));
gc.record_set_ptr(expr->p, 1, tmp.p);
alloc_of_type(&tmp, CELL);
gc.record_set_ptr(expr->p, 2, tmp.p);
tokenize(input, tok);
if (expr_all(input, tok, &quoted)) {
r = 1;
gc.record_set_ptr(tmp.p, 1, quoted.p);
gc.record_set_ptr(tmp.p, 2, empty_list.p);
}
tmp.p = stack->p;
stack->p = gc.alloc_record(&gc, EXPR_FIELD_NUM);
gc.record_set_ptr(stack->p, EXPR_FIELD_NEXT, tmp.p);
uns_root_remove(&gc, &tmp);
uns_root_remove(&gc, &quoted);
gc.record_set_ptr(stack->p, EXPR_FIELD_PTR, loc->p);
tmp.p = gc.alloc(&gc, sizeof(state));
memcpy(tmp.p, &state, sizeof(state));
gc.record_set_ptr(stack->p, EXPR_FIELD_STATE, tmp.p);
}
static enum expr_stack_state expr_stack_state(struct uns_ctr *stack)
{
enum expr_stack_state r;
if (!stack->p)
return EXPR_STACK_INVALID;
memcpy(&r, gc.record_get_ptr(stack->p, EXPR_FIELD_STATE),
sizeof(r));
return r;
}
/* expr does not call tokenizer directly: it acts on an input token */
static int expr_all(FILE *input, struct token *tok, struct uns_ctr *expr)
static void expr_stack_change_state(struct uns_ctr *stack, enum expr_stack_state newst)
{
switch (tok->typ) {
case LPAREN:
tokenize(input, tok);
if (tok->typ == RPAREN)
expr->p = empty_list.p;
else
return list_expr(input, expr, tok);
break;
case QUOTE:
return surround_expr(input, expr, tok, "quote");
case QUASIQUOTE:
return surround_expr(input, expr, tok, "quasiquote");
case UNQUOTE:
return surround_expr(input, expr, tok, "unquote");
case UNQUOTE_LIST:
return surround_expr(input, expr, tok, "unquote-list");
case T_IDENT:
alloc_of_type(expr, SYMBOL);
gc.record_set_ptr(expr->p, 1, tok->dat.p);
break;
case T_INT:
alloc_integer(expr, tok->i);
break;
case T_FLOAT:
alloc_float(expr, tok->f);
break;
case T_STRING:
alloc_of_type(expr, STRING);
gc.record_set_ptr(expr->p, 1, tok->dat.p);
break;
case RPAREN: case T_EOF: case TOKEN_NUM: case T_DOT:
return 0;
}
memcpy(gc.record_get_ptr(stack->p, EXPR_FIELD_STATE),
&newst,
sizeof(newst));
}
static void expr_stack_ctr(struct uns_ctr *stack, struct uns_ctr *loc)
{
loc->p = gc.record_get_ptr(stack->p, EXPR_FIELD_PTR);
}
static int expr_stack_pop(struct uns_ctr *stack)
{
if (!stack->p)
return 0;
stack->p = gc.record_get_ptr(stack->p, EXPR_FIELD_NEXT);
return 1;
}
static int read_next(FILE *input, struct uns_ctr *expr)
enum parser_return {
EXPR_PARSE_OK,
EXPR_PARSE_EOF,
EXPR_PARSE_INCOMPLETE,
EXPR_PARSE_EXCESS_RPAREN,
EXPR_PARSE_INVALID_EMPTY_LIST,
EXPR_PARSE_IMPROPER_LIST_OVERFLOW,
EXPR_PARSE_BAD_IMPROPER_LIST,
EXPR_PARSE_BAD_STACK_START,
EXPR_PARSE_BAD_QUOTE,
EXPR_PARSE_INTERNAL_ERROR
};
static int tok_to_type(enum token_type tokt)
{
switch (tokt) {
case T_IDENT: return SYMBOL;
case T_STRING: return STRING;
default: return -1;
}
}
/* Parse expressions using an explicit stack.
* This is feasible to to hand-written and has the advantage that error
* recovery is much easier than recursion (implicit stack).
*
* The stack consists of a LIFO:
* [previous value][container][parser state]
*
* The implementation stores containers on the stack and when a container
* is complete or an atom is read, it is stored in "expr" and the stack
* is modified.
*/
static enum parser_return expr_parse(FILE *input, struct uns_ctr *expr)
{
struct uns_ctr stack = {0};
struct uns_ctr loc = {0};
struct uns_ctr new_cell = {0};
struct token tok = {0};
int r = 0;
int store = 0;
enum parser_return r = EXPR_PARSE_OK;
uns_root_add(&gc, &stack);
uns_root_add(&gc, &loc);
uns_root_add(&gc, &tok.dat);
tokenize(input, &tok);
expr->p = NULL;
if (tok.typ != T_EOF)
r = expr_all(input, &tok, expr);
uns_root_remove(&gc, &tok.dat);
uns_root_add(&gc, &new_cell);
/* Allocate expr->p to be a single pointer. */
expr->p = gc.alloc_record(&gc, 1);
/* Initialize the stack with the final result. */
expr_stack_push(&stack, expr, EXPR_STACK_INITIAL);
while (stack.p) {
tokenize(input, &tok);
switch (tok.typ) {
case LPAREN:
/* Push a new cell onto the stack and parse it later. */
alloc_of_type(expr, CELL);
expr_stack_push(&stack, expr, EXPR_STACK_START_LIST);
break;
case QUOTE: case QUASIQUOTE: case UNQUOTE: case UNQUOTE_LIST:
/* push (token-name <undefined>) onto the stack, and have
* it point to the first cell.
*/
alloc_of_type(&loc, CELL);
alloc_symbol_from_cstring(expr, token2string_repr[tok.typ],
strlen(token2string_repr[tok.typ]));
gc.record_set_ptr(loc.p, 1, expr->p);
alloc_of_type(expr, CELL);
gc.record_set_ptr(loc.p, 2, expr->p);
gc.record_set_ptr(expr->p, 2, empty_list.p);
if (tok.typ == QUOTE || tok.typ == QUASIQUOTE)
expr_stack_push(&stack, &loc, EXPR_STACK_QUOTELIKE);
else
expr_stack_push(&stack, &loc, EXPR_STACK_SURROUND_OTHER);
break;
/* All atoms are allocated into "expr" and will be stored in the
* container after the switch.
*/
case T_IDENT: case T_STRING:
alloc_of_type(expr, tok_to_type(tok.typ));
gc.record_set_ptr(expr->p, 1, tok.dat.p);
store = 1;
break;
case T_INT:
alloc_of_type(expr, INTEGER);
loc.p = gc.alloc(&gc, sizeof(tok.i));
memcpy(loc.p, &tok.i, sizeof(tok.i));
gc.record_set_ptr(expr->p, 1, loc.p);
store = 1;
break;
case T_FLOAT:
alloc_of_type(expr, FLOAT);
loc.p = gc.alloc(&gc, sizeof(tok.f));
memcpy(loc.p, &tok.f, sizeof(tok.f));
gc.record_set_ptr(expr->p, 1, loc.p);
store = 1;
break;
case RPAREN:
switch (expr_stack_state(&stack)) {
case EXPR_STACK_START_LIST:
/* Empty list found. Pop the cell and discard it.
* If we are not in a "surround" state, report error.
* Replace the surround state with an empty list.
*/
expr_stack_pop(&stack);
if (!stack.p) {
r = EXPR_PARSE_INTERNAL_ERROR;
goto end;
}
if (expr_stack_state(&stack) != EXPR_STACK_QUOTELIKE) {
r = EXPR_PARSE_INVALID_EMPTY_LIST;
goto end;
}
/* Discard quote/quasiquote and store the empty list instead. */
if (!expr_stack_pop(&stack)) {
r = EXPR_PARSE_INTERNAL_ERROR;
goto end;
}
expr->p = empty_list.p;
store = 1;
break;
case EXPR_STACK_IN_LIST:
/* In a list, there are two things on the stack:
* first cell and last cell.
* Place the empty list in the cdr of the last cell,
* and then discard the last cell. Put the first cell into expr
* and store it in the container in the stack before it.
*/
expr_stack_ctr(&stack, &loc);
if (get_type(loc.p) != CELL) {
r = EXPR_PARSE_INTERNAL_ERROR;
goto end;
}
gc.record_set_ptr(loc.p, 2, empty_list.p);
/* FALLTHROUGH */
case EXPR_STACK_IMPROPER_LIST_END:
/* CDR of the list has been filled. Pop the last cell. */
expr_stack_pop(&stack);
if (!stack.p) {
r = EXPR_PARSE_INTERNAL_ERROR;
goto end;
}
/* Put the first cell in expr, and read it into the next item
* of the stack.
*/
expr_stack_ctr(&stack, expr);
expr_stack_pop(&stack);
store = 1;
break;
case EXPR_STACK_INITIAL:
/* Expression is not balanced. */
r = EXPR_PARSE_EXCESS_RPAREN;
goto end;
case EXPR_STACK_IMPROPER_LIST:
/* (values .) is an invalid expression. */
r = EXPR_PARSE_BAD_IMPROPER_LIST;
goto end;
case EXPR_STACK_QUOTELIKE: case EXPR_STACK_SURROUND_OTHER:
/* ,) ') etc are invalid */
r = EXPR_PARSE_BAD_QUOTE;
goto end;
case EXPR_STACK_INVALID:
r = EXPR_PARSE_INTERNAL_ERROR;
goto end;
}
break;
case T_DOT:
if (expr_stack_state(&stack) != EXPR_STACK_IN_LIST) {
r = EXPR_PARSE_BAD_IMPROPER_LIST;
goto end;
}
expr_stack_change_state(&stack, EXPR_STACK_IMPROPER_LIST);
break;
case T_EOF:
/* If EOF is encountered in the middle of an expression, repor
* that. Otherwise, the parser reads nothing.
*/
if (expr_stack_state(&stack) != EXPR_STACK_INITIAL) {
r = EXPR_PARSE_INCOMPLETE;
goto end;
}
r = EXPR_PARSE_EOF;
goto end;
case TOKEN_NUM:
r = EXPR_PARSE_INTERNAL_ERROR;
goto end;
}
/* If store is true, that means "expr" contains a value to
* be placed into the container at the top of the stack.
*
* This is a loop because some cases need to cascade values up the
* stack without acting on a token. When the loop stops, further
* action requires a new token to be read.
*/
while (store) switch (expr_stack_state(&stack)) {
case EXPR_STACK_INVALID:
r = EXPR_PARSE_INTERNAL_ERROR;
goto end;
case EXPR_STACK_START_LIST:
/* The current thing on the stack is the first part of the
* list. Add the expression to car of the cell, and add
* push a pointer to the cell. This is the last cell in
* the list.
*/
expr_stack_ctr(&stack, &loc);
if (get_type(loc.p) != CELL) {
r = EXPR_PARSE_INTERNAL_ERROR;
goto end;
}
gc.record_set_ptr(loc.p, 1, expr->p);
expr_stack_push(&stack, &loc, EXPR_STACK_IN_LIST);
store = 0;
break;
case EXPR_STACK_IN_LIST:
/* The pointer on the top of the stack is the end of the
* list. Add a new list to it's cdr, and put expr into
* the car of the new end of the list.
*/
expr_stack_ctr(&stack, &loc);
if (get_type(loc.p) != CELL) {
r = EXPR_PARSE_INTERNAL_ERROR;
goto end;
}
alloc_of_type(&new_cell, CELL);
gc.record_set_ptr(loc.p, 2, new_cell.p);
gc.record_set_ptr(new_cell.p, 1, expr->p);
expr_stack_pop(&stack);
expr_stack_push(&stack, &new_cell, EXPR_STACK_IN_LIST);
store = 0;
break;
case EXPR_STACK_IMPROPER_LIST:
/* Add the expression to the cdr of the last element.
* Set the state to expect the end of an improper list.
*/
expr_stack_ctr(&stack, &loc);
gc.record_set_ptr(loc.p, 2, expr->p);
if (get_type(loc.p) != CELL) {
r = EXPR_PARSE_INTERNAL_ERROR;
goto end;
}
expr_stack_change_state(&stack, EXPR_STACK_IMPROPER_LIST_END);
store = 0;
break;
case EXPR_STACK_IMPROPER_LIST_END:
r = EXPR_PARSE_IMPROPER_LIST_OVERFLOW;
goto end;
case EXPR_STACK_QUOTELIKE: case EXPR_STACK_SURROUND_OTHER:
/* Expression at the top of the stack is
* (surrouding-thing <undefined>)
* Replace (car (cdr (top-of-stack)) with expr.
*/
expr_stack_ctr(&stack, &loc);
if (get_type(loc.p) != CELL) {
r = EXPR_PARSE_INTERNAL_ERROR;
goto end;
}
loc.p = gc.record_get_ptr(loc.p, 2);
if (get_type(loc.p) != CELL) {
r = EXPR_PARSE_INTERNAL_ERROR;
goto end;
}
gc.record_set_ptr(loc.p, 1, expr->p);
/* Get the head of the quoted value again. */
expr_stack_ctr(&stack, expr);
expr_stack_pop(&stack);
/* This value needs to be stored somewhere. Continue looping. */
store = 1;
break;
case EXPR_STACK_INITIAL:
/* Finished parsing an expression at toplevel. */
expr_stack_ctr(&stack, &loc);
gc.record_set_ptr(loc.p, 0, expr->p);
/* Prepare for return. */
expr->p = loc.p;
expr_stack_pop(&stack);
if (stack.p) {
r = EXPR_PARSE_BAD_STACK_START;
goto end;
}
store = 0;
break;
}
}
end:
uns_root_remove(&gc, &stack);
uns_root_remove(&gc, &loc);
uns_root_remove(&gc, &tok.dat);
uns_root_remove(&gc, &new_cell);
return r;
}
@ -580,6 +801,42 @@ static void display(struct uns_ctr *ctr)
}
}
/* Evaluation:
*
* Symbols are looked up for their value in the environment.
* Numbers, strings, etc. are returned as-is.
* Quote returns the list verbatim. Quasiquote runs another evaluation
* procedure.
*
* All function calls evalute the arguments, left to right. If the arguments
* are not a list, then it evaluated.
*
* A tail-call of the form ((__unilambda l body) . args) binds the evaluated
* args to l and jumps to body, without pushing a stack.
*
* Any other tail-call of the form (function . args) will push a new stack
* with the new values passed to the function, the previous stack removed and
* the new stack linked to the previous-previous stack.
*
* A non-tail call will push a new stack with a link to the previous stack.
*
* A lambda that is not evaluated immediately is stored with a link to the
* top stack frame, the binding symbol and the code verbatim.
*
* A tail-call is either in the body of __unilambda or the latter value in
* "if". The repl is not a tail-call position.
*
* If "(current-continuation)" is the argument of a call, then the
* continuation is passed to the argument. The continuation proceeds in the
* body.
*
* If "(current-continuation)" is the body of a lambda, then the stack is
* unwound until the result will be the argument to another lambda.
*
* A continuation is a pointer to the stack frame and the expression to
* evaluate.
*/
int main(void)
{
struct uns_ctr expr = {0};
@ -587,9 +844,41 @@ int main(void)
init_gc();
uns_root_add(&gc, &expr);
while (read_next(stdin, &expr)) {
display(&expr);
printf("\n");
while (!feof(stdin)) {
expr.p = NULL;
switch (expr_parse(stdin, &expr)) {
case EXPR_PARSE_OK:
expr.p = gc.record_get_ptr(expr.p, 0);
display(&expr);
printf("\n");
break;
case EXPR_PARSE_INCOMPLETE:
fprintf(stderr, "EOF before expression was finished\n");
break;
case EXPR_PARSE_EXCESS_RPAREN:
fprintf(stderr, "Unbalanced parentheses\n");
break;
case EXPR_PARSE_INVALID_EMPTY_LIST:
fprintf(stderr, "Invalid syntax for empty list (either '() or `'())\n");
break;
case EXPR_PARSE_BAD_QUOTE:
fprintf(stderr, "Invalid syntax for quotes/unquotes\n");
break;
case EXPR_PARSE_IMPROPER_LIST_OVERFLOW:
fprintf(stderr, "too many values at end of improper list (must be exactly one)\n");
case EXPR_PARSE_BAD_IMPROPER_LIST:
fprintf(stderr, "Bad syntax for improper list (must be (value . value))");
break;
case EXPR_PARSE_INTERNAL_ERROR:
fprintf(stderr, "Bug in implementation\n");
break;
case EXPR_PARSE_BAD_STACK_START:
fprintf(stderr, "Bug: bottom of stack not correct\n");
break;
case EXPR_PARSE_EOF:
fprintf(stderr, "EOF\n");
break;
}
}
uns_root_remove(&gc, &expr);

View File

@ -30,10 +30,10 @@
; __define: define in the environment
; __define-macro: Like __define but for macros
; set!
; if
; __exec: apply an argument to a lambda and return it to the continuation.
; __continue-if: continuation version of if.
;
; Others:
; __uniapply: Apply function to single argument
; <undefined>: A value that can only be passed to __unilambda. It cannot
; be bound to a top-level, or "set!", or added to a list.
; It can also be the parameter of a __unilambda, although
@ -41,6 +41,15 @@
;
; Macros are functions that return an sexpr.
(__define-macro if
(__unilambda l
(__continue-if (car l)
(__unilambda <undefined> (car (cdr l)))
(__unilambda <undefined> (car (cdr (cdr l))))
)
)
)
(__define-macro let
(__unilambda l ; car = args, cdr = body
(if (null? (car args))
@ -48,9 +57,11 @@
; (car args) = ((A a) (B b) ...)
; (car (car args)) = (A a)
; (cdr (car (car args))) = (a)
`((__unilambda ,(car (car (car args)))
(let ,(cdr (car args)) ,@body))
. ,(car (cdr (car (car args))))
`(__exec (__unilambda <undefined> ,(car (cdr (car (car args)))))
<undefined>
(__unilambda ,(car (car (car args)))
(let ,(cdr (car args)) ,@body)
)
)
)
)
@ -58,16 +69,17 @@
(__define-macro let* let)
; if and __unilambda only execute one statement. This uses Flatrate's
; evaluation order (arguments first) and tail-call optimization to simulate
; multiple statements.
; if and __unilambda only execute one statement.
(__define-macro begin body
(let ((first (car body))
(rest (cdr body))
)
(if (null? rest)
first
`((__unilambda <undefined> (begin ,@rest)) . ,first)
`(__exec (__unilambda <undefined> ,first)
<undefined>
(__unilambda <undefined> (begin ,@rest))
)
)
)
)
@ -85,14 +97,17 @@
(begin ,@body)
)
)
(let* ((argval (cons args))
(rest (cdr args))
(arg (cons argval))
(val (cons (cdr argval)))
(if (symbol? args)
`(__unilambda ,args ,@body)
(let* ((argval (cons args))
(rest (cdr args))
(arg (cons argval))
(val (cons (cdr argval)))
)
`(__unilambda ,larg
(let ((,arg ,val)) __bindlambda ,larg ,rest ,@body)
)
`(__unilambda ,larg
(let ((,arg ,val)) __bindlambda ,larg ,rest ,@body)
)
)
)
)
)
@ -118,13 +133,11 @@
)
(__define-macro define-macro
(__unilambda l
(let* ((name-and-args (car l))
(name (car name-and-args))
(args (cdr name-and-args))
(body (cdr l))
(tmpname (gensym))
)
(lambda (name-and-args . body)
(let ((name (car name-and-args))
(args (cdr name-and-args))
(tmpname (gensym))
)
`(__define-macro ,name
(lambda ,args ,@body)
)
@ -189,6 +202,11 @@
;;;;;;;;;;;;;; Standard Library ;;;;;;;;;;;;;;;;;;;
(define (call-with-current-continuation f)
(f (current-continuation))
)
(define call/cc call-with-current-continuation)
;; for a list of (v1 v2 ... vn)
;; runs (f vn (... (f v2 (f v1 start))))
(define (foldl f start l)